# Source code for nnmnkwii.io.hts

# Part of code here is adapted from Merlin. Their license follows:
################################################################################
#           The Neural Network (NN) based Speech Synthesis System
#                https://github.com/CSTR-Edinburgh/merlin
#
#                Centre for Speech Technology Research
#                     University of Edinburgh, UK
#                      Copyright (c) 2014-2015
#                        All Rights Reserved.
#
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
#  Permission is hereby granted, free of charge, to use and distribute
#  this software and its documentation without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of this work, and to
#  permit persons to whom this work is furnished to do so, subject to
#  the following conditions:
#
#   - Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   - Redistributions in binary form must reproduce the above
#     copyright notice, this list of conditions and the following
#     disclaimer in the documentation and/or other materials provided
#     with the distribution.
#   - The authors' names may not be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
#  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
#  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
#  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
#  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
#  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
#  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
#  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
#  THIS SOFTWARE.
################################################################################

from __future__ import division, print_function, absolute_import

import numpy as np
import re

# TODO: consider two label alignment formats


class HTSLabelFile(object):
    """Memory representation for HTS-style context labels file.

    Indexing is supported. It returns tuple of
    (``start_time``, ``end_time``, ``label``).

    Attributes:
        frame_shift_in_micro_sec (int): Frame shift in micro seconds
        start_times (ndarray): Start times (an empty list until labels are
            loaded)
        end_times (ndarray): End times (an empty list until labels are
            loaded)
        contexts (ndarray): Contexts (an empty list until labels are loaded).

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
        >>> print(labels[0])
        (0, 50000, 'x^x-sil+hh=iy@x_x/A:0_0_0/B:x-x-x@x-x&x-x#x-x$x-x!x-x;x-x|x\
/C:1+1+2/D:0_0/E:x+x@x+x&x+x#x+x/F:content_1/G:0_0/H:x=x@1=2|0/I:4=3/\
J:13+9-2[2]')
    """

    # Default pattern used to detect silence labels.
    # NOTE(review): the ``+`` is not escaped, so it acts as a quantifier on
    # ``l`` rather than matching the literal ``+`` of ``-sil+``. Kept as-is to
    # preserve existing matching behavior.
    _DEFAULT_SILENCE_PATTERN = ".*-sil+.*"

    def __init__(self, frame_shift_in_micro_sec=50000):
        self.start_times = []
        self.end_times = []
        self.contexts = []
        self.frame_shift_in_micro_sec = frame_shift_in_micro_sec

    def __len__(self):
        return len(self.start_times)

    def __getitem__(self, idx):
        return self.start_times[idx], self.end_times[idx], self.contexts[idx]

    def __str__(self):
        # One "start end context" line per label, each newline-terminated.
        return "".join(
            "{} {} {}\n".format(s, e, context)
            for s, e, context in zip(
                self.start_times, self.end_times, self.contexts))

    def __repr__(self):
        return str(self)

    def set_durations(self, durations):
        """Set start/end times from duration features

        Args:
            durations (ndarray): Durations in number of frames, one value per
                label. Any shape is accepted; the array is flattened.

        Raises:
            RuntimeError: If the number of durations differs from the number
                of labels currently held.

        TODO:
            this should be refactored
        """
        # Unwrap state-axis: flatten, convert frames to micro seconds and
        # accumulate to get the end time of every label.
        # ``int`` replaces the ``np.int`` alias that was removed from NumPy.
        end_times = np.cumsum(
            np.asarray(durations).reshape(-1, 1)
            * self.frame_shift_in_micro_sec).astype(int)
        if len(end_times) != len(self.end_times):
            raise RuntimeError("Unexpected input, maybe")
        # Assuming first label starts with time `0`
        # Is this really true? probably no
        start_times = np.hstack((0, end_times[:-1])).astype(int)
        self.start_times, self.end_times = start_times, end_times

    def load(self, path):
        """Load labels from file

        Each non-blank line is expected to consist of three whitespace
        separated fields: start time, end time and context.

        Args:
            path (str): File path

        Raises:
            ValueError: If a non-blank line does not have exactly three
                fields, or the times are not integers.
        """
        start_times = []
        end_times = []
        contexts = []
        # TODO: consider comments?
        with open(path) as f:
            for line in f:
                # Splitting on whitespace drops the line terminator without
                # chopping the last character of a final unterminated line.
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                start_time, end_time, context = fields
                start_times.append(int(start_time))
                end_times.append(int(end_time))
                contexts.append(context)

        self.start_times = np.array(start_times, dtype=int)
        self.end_times = np.array(end_times, dtype=int)
        self.contexts = np.array(contexts)

    def silence_label_indices(self, regex=None):
        """Returns silence label indices

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence label indices
        """
        if regex is None:
            regex = re.compile(self._DEFAULT_SILENCE_PATTERN)
        is_silence = [regex.match(c) is not None for c in self.contexts]
        return np.where(is_silence)[0]

    def silence_phone_indices(self, regex=None):
        """Returns silence phone indices

        Label indices are mapped to phone indices by integer division by the
        number of states per phone (see :func:`num_states`).

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence phone indices
        """
        # Use the detected state count instead of a hard-coded 5 so that
        # phone-level labels and other state counts are handled as well.
        return np.unique(
            self.silence_label_indices(regex) // self.num_states())

    def silence_frame_indices(self, regex=None):
        """Returns silence frame indices

        Similar to :func:`silence_label_indices`, but returns indices in frame-level.

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence frame indices
        """
        indices = self.silence_label_indices(regex)
        if len(indices) == 0:
            # Integer dtype, consistent with the non-empty result below.
            return np.empty(0, dtype=int)
        s = self.start_times[indices] // self.frame_shift_in_micro_sec
        e = self.end_times[indices] // self.frame_shift_in_micro_sec
        return np.unique(np.concatenate(
            [np.arange(a, b) for (a, b) in zip(s, e)], axis=0)).astype(int)

    def is_state_alignment_label(self):
        """Returns True if the first context ends with a ``[n]`` suffix,
        where ``n`` is a single character. Returns False for empty labels.
        """
        if len(self.contexts) == 0:
            return False
        first = self.contexts[0]
        # Slices (rather than indices) keep very short contexts from raising.
        return first[-1:] == ']' and first[-3:-2] == '['

    def num_states(self):
        """Returns number of states excluding special begin/end states.

        The state number is read from the second to last character of each
        context; counting stops at the first label whose state number does
        not increase. Returns 1 for labels without state alignment.
        """
        if not self.is_state_alignment_label():
            return 1

        initial_state_num = int(self.contexts[0][-2])
        largest_state_num = initial_state_num
        for label in self.contexts[1:]:
            n = int(label[-2])
            if n <= largest_state_num:
                break
            largest_state_num = n
        return largest_state_num - initial_state_num + 1

    def num_phones(self):
        """Returns number of phones: the number of labels divided by the
        number of states for state alignment labels, else the number of
        labels.
        """
        if self.is_state_alignment_label():
            return len(self) // self.num_states()
        return len(self)

    def num_frames(self):
        """Returns number of frames, computed from the last end time and the
        frame shift. Raises ``IndexError`` if no labels are held.
        """
        return self.end_times[-1] // self.frame_shift_in_micro_sec


def load(path, frame_shift_in_micro_sec=50000):
    """Read an HTS-style label file into a new :class:`HTSLabelFile`.

    Args:
        path (str): Path of the label file to read.
        frame_shift_in_micro_sec (optional[int]): Frame shift in micro
            seconds, passed to the ``HTSLabelFile`` constructor.
            Default is 50000.

    Returns:
        labels (HTSLabelFile): Newly created instance populated from ``path``.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
    """
    label_file = HTSLabelFile(frame_shift_in_micro_sec)
    label_file.load(path)
    return label_file


def wildcards2regex(question, convert_number_pattern=False):
    r"""Convert HTK-style question into regular expression for searching labels.

    A question that contains ``*`` but does not start (end) with it is
    anchored to the start (end) of the label with ``\A`` (``\Z``). Leading and
    trailing ``*`` are dropped, the rest is escaped with :func:`re.escape`,
    and any remaining ``*`` becomes ``.*``. A question without any ``*`` is
    escaped and left unanchored. The HTK ``?`` wildcard is not converted; it
    is escaped and matched literally.

    If convert_number_pattern, keep the following sequences unescaped for
    extracting continuous values:

    - ``(\d+)``       -- handles digit without decimal point
    - ``([\d\.]+)``   -- handles digits with and without decimal point

    Args:
        question (str): HTK-style question pattern.
        convert_number_pattern (bool): Keep the number capturing groups
            listed above unescaped. Default is False.

    Returns:
        str: Regular expression source (not compiled).
    """
    # handle HTK wildcards (and lack of them) at ends of label.
    # The anchors are kept apart until after escaping; concatenating them
    # first would turn them into a literal backslash plus letter.
    prefix = ''
    postfix = ''
    if '*' in question:
        if not question.startswith('*'):
            prefix = r'\A'
        if not question.endswith('*'):
            postfix = r'\Z'
    question = re.escape(question.strip('*'))
    # convert remaining HTK wildcard * to the equivalent regex:
    question = question.replace('\\*', '.*')
    question = prefix + question + postfix

    if convert_number_pattern:
        # Undo the escaping of the number capturing groups.
        for number_pattern in (r'(\d+)', r'([\d\.]+)'):
            question = question.replace(
                re.escape(number_pattern), number_pattern)
    return question


def load_question_set(qs_file_name):
    """Load HTS-style question and convert it to binary/continuous feature
    extraction regexes.

    This code was taken from Merlin.

    Each question line has the form ``<type> <name> {<pattern>[,<pattern>...]}``
    where ``<type>`` is ``QS`` (binary) or ``CQS`` (continuous). Lines of five
    characters or fewer, whitespace-only lines and lines starting with ``#``
    are skipped.

    Args:
        qs_file_name (str): Input HTS-style question file path

    Returns:
        (binary_dict, continuous_dict): Binary/continuous feature extraction
        regexes. Keys are the question indices as strings, counted separately
        for each dict in file order. ``binary_dict`` values are lists of
        compiled regexes (one per pattern); ``continuous_dict`` values are
        single compiled regexes.

    Raises:
        RuntimeError: If a question line starts with neither ``QS`` nor
            ``CQS``.
        IndexError: If a question line has no ``{`` or fewer than two
            whitespace separated fields.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_question_file
        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
    """
    with open(qs_file_name) as f:
        lines = f.readlines()
    binary_dict = {}
    continuous_dict = {}
    LL = re.compile(re.escape('LL-'))

    for line in lines:
        line = line.replace('\n', '')
        stripped = line.strip()
        # Skip short lines, blank lines and comments instead of crashing on
        # them further down.
        if len(line) <= 5 or not stripped or stripped.startswith('#'):
            continue

        # Patterns are the comma separated items between the braces.
        patterns = line.split('{')[1].split('}')[0].strip()
        question_list = patterns.split(',')

        # Split on any whitespace so that tabs, repeated spaces and leading
        # indentation do not shift the fields.
        fields = line.split()
        question_type, question_key = fields[0], fields[1]

        if question_type == 'CQS':
            assert len(question_list) == 1, \
                "CQS question must have exactly one pattern"
            processed_question = wildcards2regex(
                question_list[0], convert_number_pattern=True)
            # save pre-compiled regular expression
            continuous_dict[str(len(continuous_dict))] = re.compile(
                processed_question)
        elif question_type == 'QS':
            # Questions whose name contains "LL-" are anchored at the start.
            anchor = '^' if LL.search(question_key) else ''
            binary_dict[str(len(binary_dict))] = [
                re.compile(anchor + wildcards2regex(temp_question))
                for temp_question in question_list
            ]
        else:
            raise RuntimeError("Not supported question format")
    return binary_dict, continuous_dict