Source code for nnmnkwii.io.hts

# Part of code here is adapted from Merlin. Their license follows:
################################################################################
#           The Neural Network (NN) based Speech Synthesis System
#                https://github.com/CSTR-Edinburgh/merlin
#
#                Centre for Speech Technology Research
#                     University of Edinburgh, UK
#                      Copyright (c) 2014-2015
#                        All Rights Reserved.
#
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
#  Permission is hereby granted, free of charge, to use and distribute
#  this software and its documentation without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of this work, and to
#  permit persons to whom this work is furnished to do so, subject to
#  the following conditions:
#
#   - Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   - Redistributions in binary form must reproduce the above
#     copyright notice, this list of conditions and the following
#     disclaimer in the documentation and/or other materials provided
#     with the distribution.
#   - The authors' names may not be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
#  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
#  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
#  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
#  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
#  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
#  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
#  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
#  THIS SOFTWARE.
################################################################################

from __future__ import division, print_function, absolute_import

import numpy as np
import re

# TODO: consider the two label alignment formats


class HTSLabelFile(object):
    """Memory representation for HTS-style context labels file.

    Indexing is supported. It returns tuple of
    (``start_time``, ``end_time``, ``label``).

    Attributes:
        frame_shift_in_micro_sec (int): Frame shift in micro seconds
        start_times (ndarray): Start times
        end_times (ndarray): End times
        contexts (ndarray): Contexts.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
        >>> print(labels[0])
        (0, 50000, 'x^x-sil+hh=iy@x_x/A:0_0_0/B:x-x-x@x-x&x-x#x-x$x-x!x-x;x-x|x\
/C:1+1+2/D:0_0/E:x+x@x+x&x+x#x+x/F:content_1/G:0_0/H:x=x@1=2|0/I:4=3/\
J:13+9-2[2]')
    """

    # Default silence pattern: a centre phone ``sil`` delimited by ``-`` and a
    # literal ``+`` (the ``+`` must be escaped, otherwise it means "one or
    # more 'l'" and the right-hand delimiter is not checked at all).
    _DEFAULT_SILENCE_PATTERN = r".*-sil\+.*"

    def __init__(self, frame_shift_in_micro_sec=50000):
        # Plain lists until ``load`` / ``set_durations`` replace them with
        # ndarrays.
        self.start_times = []
        self.end_times = []
        self.contexts = []
        self.frame_shift_in_micro_sec = frame_shift_in_micro_sec

    def __len__(self):
        return len(self.start_times)

    def __getitem__(self, idx):
        return self.start_times[idx], self.end_times[idx], self.contexts[idx]

    def __str__(self):
        # One "start end context" line per label, each newline-terminated.
        return "".join(
            "{} {} {}\n".format(*self[idx]) for idx in range(len(self)))

    def __repr__(self):
        return str(self)

    def _silence_regex(self, regex):
        """Return ``regex`` or, when it is None, the compiled default."""
        if regex is None:
            return re.compile(self._DEFAULT_SILENCE_PATTERN)
        return regex

    def set_durations(self, durations):
        """Set start/end times from duration features

        Durations are flattened, multiplied by ``frame_shift_in_micro_sec``
        and accumulated into end times. The first label is assumed to start
        at time ``0``.

        Args:
            durations (ndarray): Duration of every label, in frames.

        Raises:
            RuntimeError: If the number of durations differs from the number
                of end times currently stored.

        TODO: this should be refactored
        """
        # Unwrap state-axis
        scaled = (np.asarray(durations).reshape(-1)
                  * self.frame_shift_in_micro_sec)
        # np.int64 instead of the removed ``np.int`` alias.
        end_times = np.cumsum(scaled).astype(np.int64)
        if len(end_times) != len(self.end_times):
            raise RuntimeError("Unexpected input, maybe")
        # Every label starts where the previous one ended; the first at 0.
        start_times = np.zeros_like(end_times)
        start_times[1:] = end_times[:-1]
        self.start_times, self.end_times = start_times, end_times

    def load(self, path):
        """Load labels from file

        Every non-blank line must consist of three whitespace-separated
        fields: start time, end time and context.

        Args:
            path (str): File path

        Raises:
            ValueError: If a line does not have exactly three fields or its
                times are not integers.
        """
        with open(path) as f:
            # strip() rather than line[:-1]: the last line may lack a
            # trailing newline, in which case [:-1] would eat a real
            # character of the context.
            rows = [line.strip() for line in f]
        rows = [row for row in rows if row]

        start_times = np.empty(len(rows), dtype=np.int64)
        end_times = np.empty(len(rows), dtype=np.int64)
        contexts = []
        # TODO: consider comments?
        for idx, row in enumerate(rows):
            start_time, end_time, context = row.split()
            start_times[idx] = int(start_time)
            end_times[idx] = int(end_time)
            contexts.append(context)

        self.start_times = start_times
        self.end_times = end_times
        self.contexts = np.array(contexts)

    def silence_label_indices(self, regex=None):
        """Returns silence label indices

        Args:
            regex (re(optional)): Compiled regex to find silence labels.
                Defaults to a pattern matching ``-sil+``.

        Returns:
            1darray: Silence label indices
        """
        regex = self._silence_regex(regex)
        hits = [regex.match(context) is not None for context in self.contexts]
        return np.flatnonzero(np.asarray(hits, dtype=bool))

    def silence_phone_indices(self, regex=None):
        """Returns silence phone indices

        Label indices are divided by :func:`num_states`, so the result is
        phone-level for state-aligned labels and equals the label indices
        otherwise.

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence phone indices
        """
        return np.unique(
            self.silence_label_indices(regex) // self.num_states())

    def silence_frame_indices(self, regex=None):
        """Returns silence frame indices

        Similar to :func:`silence_label_indices`, but returns indices in
        frame-level.

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence frame indices (integer dtype, possibly empty)
        """
        indices = self.silence_label_indices(regex)
        if len(indices) == 0:
            # Integer dtype so that the result is always usable as an index.
            return np.empty(0, dtype=np.int64)
        shift = self.frame_shift_in_micro_sec
        first_frames = np.asarray(self.start_times)[indices] // shift
        last_frames = np.asarray(self.end_times)[indices] // shift
        frames = np.concatenate(
            [np.arange(a, b) for (a, b) in zip(first_frames, last_frames)],
            axis=0)
        return np.unique(frames).astype(np.int64)

    def is_state_alignment_label(self):
        """Tell whether the first context ends with ``[<one character>]``.

        Returns False when there are no contexts or the first one is too
        short to carry such a suffix.
        """
        if len(self.contexts) == 0:
            return False
        first = self.contexts[0]
        return len(first) >= 3 and first[-1] == ']' and first[-3] == '['

    def num_states(self):
        """Returns number of states excluding special begin/end states.

        Counts how long the single-digit state number (second to last
        character of each context) keeps increasing from the first label.
        Returns ``1`` when the labels are not state-aligned.
        """
        if not self.is_state_alignment_label():
            return 1

        initial_state_num = int(self.contexts[0][-2])
        largest_state_num = initial_state_num
        for label in self.contexts[1:]:
            n = int(label[-2])
            if n <= largest_state_num:
                # State number wrapped around: the next phone has started.
                break
            largest_state_num = n
        return largest_state_num - initial_state_num + 1

    def num_phones(self):
        """Returns ``len(self)``, divided by :func:`num_states` when the
        labels are state-aligned."""
        if self.is_state_alignment_label():
            return len(self) // self.num_states()
        return len(self)

    def num_frames(self):
        """Returns the last end time divided by the frame shift (``0`` when
        there are no labels)."""
        if len(self.end_times) == 0:
            return 0
        return self.end_times[-1] // self.frame_shift_in_micro_sec
def load(path, frame_shift_in_micro_sec=50000):
    """Read an HTS-style label file into a new ``HTSLabelFile``.

    Args:
        path (str): Path of file.
        frame_shift_in_micro_sec (optional[int]): Frame shift in micro seconds.
            Default is 50000.

    Returns:
        labels (HTSLabelFile): Instance of HTSLabelFile.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
    """
    label_file = HTSLabelFile(frame_shift_in_micro_sec)
    # HTSLabelFile.load fills the instance in place.
    label_file.load(path)
    return label_file
def wildcards2regex(question, convert_number_pattern=False):
    r"""Convert HTK-style question into regular expression for searching
    labels.

    ``*`` becomes ``.*`` and ``?`` becomes ``.``; everything else is matched
    literally. When the question contains ``*`` but does not start (end)
    with one, the result is anchored with ``\A`` (``\Z``). A question without
    any ``*`` is left unanchored.

    If convert_number_pattern, keep the following sequences unescaped for
    extracting continuous values:

        (\d+)       -- handles digit without decimal point
        ([\d\.]+)   -- handles digits with and without decimal point

    Args:
        question (str): HTK-style question pattern.
        convert_number_pattern (bool): Whether to restore the number
            capturing groups listed above. Default is False.

    Returns:
        str: Regular expression source (not compiled).
    """
    # handle HTK wildcards (and lack of them) at ends of label.
    # The anchors are kept aside and attached only after re.escape();
    # escaping them would turn them into a literal backslash + letter.
    prefix = ''
    postfix = ''
    if '*' in question:
        if not question.startswith('*'):
            prefix = r'\A'
        if not question.endswith('*'):
            postfix = r'\Z'
    question = question.strip('*')
    question = re.escape(question)

    # convert remaining HTK wildcards * and ? to equivalent regex:
    question = question.replace('\\*', '.*')
    question = question.replace('\\?', '.')
    question = prefix + question + postfix

    if convert_number_pattern:
        # Undo the escaping of the two supported capturing groups.
        question = question.replace('\\(\\\\d\\+\\)', r'(\d+)')
        question = question.replace(
            '\\(\\[\\\\d\\\\\\.\\]\\+\\)', r'([\d\.]+)')
    return question
def load_question_set(qs_file_name):
    """Load HTS-style question and convert it to binary/continuous feature
    extraction regexes.

    This code was taken from Merlin.

    Each question line has the form ``<TYPE> <key> {<pattern>[,<pattern>...]}``
    where ``<TYPE>`` is ``QS`` (binary) or ``CQS`` (continuous). Lines of five
    characters or fewer and lines starting with ``#`` are skipped.

    Args:
        qs_file_name (str): Input HTS-style question file path

    Returns:
        (binary_dict, continuous_dict): Binary/continuous feature extraction
        regexes. Keys are the running question index as a string
        (``"0"``, ``"1"``, ...), counted separately per dict. Each
        ``binary_dict`` value is a list of compiled regexes; each
        ``continuous_dict`` value is a single compiled regex.

    Raises:
        RuntimeError: If a line is neither a ``QS`` nor a ``CQS`` question.
        AssertionError: If a ``CQS`` question has more than one pattern.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_question_file
        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
    """
    with open(qs_file_name) as f:
        lines = f.readlines()

    binary_dict = {}
    continuous_dict = {}
    LL = re.compile(re.escape('LL-'))

    for line in lines:
        line = line.replace('\n', '')
        # Too short to hold a question, or a comment line.
        if len(line) <= 5 or line.lstrip().startswith('#'):
            continue

        # Patterns are the comma-separated items between the first '{' and
        # the '}' that follows it.
        inside_braces = line.split('{')[1].split('}')[0].strip()
        question_list = inside_braces.split(',')

        # split() without argument tolerates tabs and repeated spaces
        # between the type, the key and the pattern list.
        tokens = line.split()
        question_type = tokens[0]
        question_key = tokens[1]

        if question_type == 'CQS':
            assert len(question_list) == 1
            processed_question = wildcards2regex(
                question_list[0], convert_number_pattern=True)
            # save pre-compiled regular expression
            continuous_dict[str(len(continuous_dict))] = re.compile(
                processed_question)
        elif question_type == 'QS':
            # Questions whose key contains "LL-" get a '^' prepended to
            # every pattern.
            needs_hat = LL.search(question_key) is not None
            re_list = []
            for temp_question in question_list:
                processed_question = wildcards2regex(temp_question)
                if needs_hat:
                    processed_question = '^' + processed_question
                re_list.append(re.compile(processed_question))
            binary_dict[str(len(binary_dict))] = re_list
        else:
            raise RuntimeError("Not supported question format")
    return binary_dict, continuous_dict