Source code for nnmnkwii.io.hts

# Part of code here is adapted from Merlin. Their license follows:
##########################################################################
#           The Neural Network (NN) based Speech Synthesis System
#                https://github.com/CSTR-Edinburgh/merlin
#
#                Centre for Speech Technology Research
#                     University of Edinburgh, UK
#                      Copyright (c) 2014-2015
#                        All Rights Reserved.
#
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
#  Permission is hereby granted, free of charge, to use and distribute
#  this software and its documentation without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of this work, and to
#  permit persons to whom this work is furnished to do so, subject to
#  the following conditions:
#
#   - Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#   - Redistributions in binary form must reproduce the above
#     copyright notice, this list of conditions and the following
#     disclaimer in the documentation and/or other materials provided
#     with the distribution.
#   - The authors' names may not be used to endorse or promote products derived
#     from this software without specific prior written permission.
#
#  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
#  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
#  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
#  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
#  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
#  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
#  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
#  THIS SOFTWARE.
##########################################################################

import re
from copy import copy

import numpy as np


class HTSLabelFile(object):
    """Memory representation for HTS-style context labels (a.k.a. HTK alignment).

    Indexing is supported. It returns a tuple of
    (``start_time``, ``end_time``, ``label``).

    Attributes:
        start_times (list): Start times in 100ns units.
        end_times (list): End times in 100ns units.
        contexts (list): Contexts. Each value should have either phone or
            full-context annotation.

    Examples:
        Load from file

        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
        >>> print(labels[0])
        (0, 50000, 'x^x-sil+hh=iy@x_x/A:0_0_0/B:x-x-x@x-x&x-x#x-x$x-x!x-x;x-x|x\
/C:1+1+2/D:0_0/E:x+x@x+x&x+x#x+x/F:content_1/G:0_0/H:x=x@1=2|0/I:4=3/\
J:13+9-2[2]')

        Create memory representation of labels

        >>> labels = hts.HTSLabelFile()
        >>> labels.append((0, 3125000, "silB"))
        0 3125000 silB
        >>> labels.append((3125000, 3525000, "m"))
        0 3125000 silB
        3125000 3525000 m
        >>> labels.append((3525000, 4325000, "i"))
        0 3125000 silB
        3125000 3525000 m
        3525000 4325000 i

        Save to file

        >>> from tempfile import TemporaryFile
        >>> with TemporaryFile("w") as f:
        ...     f.write(str(labels))
        50
    """

    def __init__(self, frame_shift=50000):
        self.start_times = []
        self.end_times = []
        self.contexts = []
        self.frame_shift = frame_shift

    @classmethod
    def create_from_contexts(cls, contexts):
        return cls().load(None, contexts)

    def __len__(self):
        return len(self.start_times)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            # Yes, this is inefficient and there is probably a better way,
            # but it is okay for now.
            current, stop, _ = idx.indices(len(self))
            obj = copy(self)
            obj.start_times = obj.start_times[current:stop]
            obj.end_times = obj.end_times[current:stop]
            obj.contexts = obj.contexts[current:stop]
            return obj
        elif isinstance(idx, list):
            obj = copy(self)
            obj.start_times = list(np.asarray(obj.start_times)[idx])
            obj.end_times = list(np.asarray(obj.end_times)[idx])
            obj.contexts = list(np.asarray(obj.contexts)[idx])
            return obj
        else:
            return self.start_times[idx], self.end_times[idx], self.contexts[idx]

    def __str__(self):
        ret = ""
        if len(self.start_times) == 0:
            return ret
        for s, e, context in self:
            ret += "{} {} {}\n".format(s, e, context)
        return ret[:-1]

    def __repr__(self):
        return str(self)

    def round_(self):
        s = self.frame_shift
        self.start_times = list(
            np.round(np.asarray(self.start_times) / s).astype(np.int64) * s
        )
        self.end_times = list(
            np.round(np.asarray(self.end_times) / s).astype(np.int64) * s
        )
        return self

    def append(self, label, strict=True):
        """Append a single alignment label

        Args:
            label (tuple): tuple of (start_time, end_time, context).
            strict (bool): strict mode.

        Returns:
            self

        Raises:
            ValueError: if start_time >= end_time
            ValueError: if last end time doesn't match start_time
        """
        start_time, end_time, context = label
        start_time = int(start_time)
        end_time = int(end_time)

        if strict:
            if start_time >= end_time:
                raise ValueError(
                    "end_time ({}) must be larger than start_time ({}).".format(
                        end_time, start_time
                    )
                )
            if len(self.end_times) > 0 and start_time != self.end_times[-1]:
                raise ValueError(
                    "start_time ({}) must be equal to the last end_time ({}).".format(
                        start_time, self.end_times[-1]
                    )
                )

        self.start_times.append(start_time)
        self.end_times.append(end_time)
        self.contexts.append(context)
        return self

    def set_durations(self, durations, frame_shift=50000):
        """Set start/end times from duration features

        TODO: this should be refactored
        """
        offset = self.start_times[0]

        # Unwrap state-axis
        end_times = offset + np.cumsum(durations.reshape(-1, 1) * frame_shift).astype(
            np.int64
        )
        if len(end_times) != len(self.end_times):
            raise RuntimeError(
                "Length of durations ({}) doesn't match the number of labels ({}).".format(
                    len(end_times), len(self.end_times)
                )
            )
        start_times = np.hstack((offset, end_times[:-1])).astype(np.int64)
        self.start_times, self.end_times = start_times, end_times
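
    # Usage sketch (illustrative): applying durations predicted by a duration
    # model. ``durations`` is assumed to hold one frame count per label; the
    # path and values below are hypothetical.
    #
    #     labels = load("phone_align.lab")           # hypothetical path, 3 labels
    #     durations = np.array([[10], [25], [30]])   # frames per label
    #     labels.set_durations(durations, frame_shift=50000)
    #     print(labels)   # start/end times now follow the cumulative durations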

    def load(self, path=None, lines=None):
        """Load labels from file

        Args:
            path (str): File path
            lines (list): Content of label file. If not None, construct
                HTSLabelFile directly from it instead of loading a file.

        Raises:
            ValueError: if the content of labels is empty.
        """
        assert path is not None or lines is not None
        if lines is None:
            with open(path) as f:
                lines = f.readlines()
        else:
            if len(lines) == 0:
                raise ValueError(
                    "Empty label is specified! Please check if the input has content."
                )

        is_sec_format = False
        start_times = []
        end_times = []
        contexts = []
        for line in lines:
            if line[0] == "#":
                continue
            cols = line.strip().split()
            if len(cols) == 3:
                start_time, end_time, context = cols
                if "." in start_time or "." in end_time:
                    is_sec_format = True
                if is_sec_format:
                    # Convert sec to 100ns (HTS format)
                    start_time = int(1e7 * float(start_time))
                    end_time = int(1e7 * float(end_time))
                else:
                    start_time = int(start_time)
                    end_time = int(end_time)
            elif len(cols) == 1:
                start_time = -1
                end_time = -1
                context = cols[0]
            else:
                raise RuntimeError("Not supported for now")
            start_times.append(start_time)
            end_times.append(end_time)
            contexts.append(context)

        self.start_times = start_times
        self.end_times = end_times
        self.contexts = contexts

        return self

    def silence_label_indices(self, regex=None):
        """Returns silence label indices

        Args:
            regex (re, optional): Compiled regex to find silence labels.

        Returns:
            1darray: Silence label indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        return np.where(list(map(regex.match, self.contexts)))[0]

    def silence_phone_indices(self, regex=None):
        """Returns phone-level silence indices

        Args:
            regex (re, optional): Compiled regex to find silence labels.

        Returns:
            1darray: Silence phone indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        return np.unique(self.silence_label_indices(regex) // self.num_states())

    def silence_frame_indices(self, regex=None, frame_shift=50000):
        """Returns silence frame indices

        Similar to :func:`silence_label_indices`, but returns indices at the
        frame level.

        Args:
            regex (re, optional): Compiled regex to find silence labels.
            frame_shift (int): Frame shift in 100ns units.

        Returns:
            1darray: Silence frame indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        indices = self.silence_label_indices(regex)
        if len(indices) == 0:
            return np.empty(0)
        start_times = np.array(self.start_times)
        end_times = np.array(self.end_times)
        s = start_times[indices] // frame_shift
        e = end_times[indices] // frame_shift
        return np.unique(
            np.concatenate([np.arange(a, b) for (a, b) in zip(s, e)], axis=0)
        ).astype(np.int64)
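
    # Usage sketch (illustrative): trimming silence frames from a frame-aligned
    # acoustic feature matrix. The feature array and its alignment with the
    # labels are assumptions here.
    #
    #     labels = load("phone_align.lab")                     # hypothetical path
    #     features = np.random.rand(labels.num_frames(), 60)   # stand-in features
    #     sil_idx = labels.silence_frame_indices(frame_shift=50000)
    #     trimmed = np.delete(features, sil_idx, axis=0)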

    def is_state_alignment_label(self):
        return self.contexts[0][-1] == "]" and self.contexts[0][-3] == "["

    def num_states(self):
        """Returns the number of states, excluding special begin/end states."""
        if not self.is_state_alignment_label():
            return 1

        assert len(self) > 0
        initial_state_num = int(self.contexts[0][-2])
        largest_state_num = initial_state_num
        for label in self.contexts[1:]:
            n = int(label[-2])
            if n > largest_state_num:
                largest_state_num = n
            else:
                break

        return largest_state_num - initial_state_num + 1

    def num_phones(self):
        if self.is_state_alignment_label():
            return len(self) // self.num_states()
        else:
            return len(self)

    def num_frames(self, frame_shift=50000):
        return self.end_times[-1] // frame_shift
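
    # Usage sketch (illustrative): for a state-level alignment whose contexts
    # end with "[2]" .. "[6]" (a typical 5-state HMM topology), the counts
    # relate as follows. The numbers are made up.
    #
    #     labels.num_states()   # -> 5
    #     labels.num_phones()   # -> len(labels) // 5
    #     labels.num_frames()   # -> labels.end_times[-1] // 50000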


def load(path=None, lines=None):
    """Load HTS-style label file

    Args:
        path (str): Path of file.
        lines (list): Content of label file. If not None, construct
            HTSLabelFile directly from it instead of loading a file.

    Returns:
        labels (HTSLabelFile): Instance of HTSLabelFile.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
    """
    labels = HTSLabelFile()
    return labels.load(path, lines)


def wildcards2regex(question, convert_number_pattern=False, convert_svs_pattern=True):
    r"""Convert an HTK-style question into a regular expression for searching labels.

    If ``convert_number_pattern`` is True, keep the following sequences unescaped
    for extracting continuous values:

        (\d+)      -- handles digits without a decimal point
        ([\d\.]+)  -- handles digits with and without a decimal point
        ([-\d]+)   -- handles positive and negative numbers
    """
    # Handle HTK wildcards (and lack of them) at the ends of the label:
    prefix = ""
    postfix = ""
    if "*" in question:
        if not question.startswith("*"):
            prefix = "\\A"
        if not question.endswith("*"):
            postfix = "\\Z"
    question = question.strip("*")
    question = re.escape(question)
    # Convert remaining HTK wildcards * and ? to equivalent regex:
    question = question.replace("\\*", ".*")
    question = prefix + question + postfix

    if convert_number_pattern:
        question = question.replace("\\(\\\\d\\+\\)", "(\\d+)")
        question = question.replace("\\(\\[\\-\\\\d\\]\\+\\)", "([-\\d]+)")
        question = question.replace("\\(\\[\\\\d\\\\\\.\\]\\+\\)", "([\\d\\.]+)")
    # NOTE: singing voice synthesis specific handling
    if convert_svs_pattern:
        question = question.replace(
            "\\(\\[A\\-Z\\]\\[b\\]\\?\\[0\\-9\\]\\+\\)", "([A-Z][b]?[0-9]+)"
        )
        question = question.replace("\\(\\\\NOTE\\)", "([A-Z][b]?[0-9]+)")
        question = question.replace("\\(\\[pm\\]\\\\d\\+\\)", "([pm]\\d+)")

    return question
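

# Usage sketch (illustrative): converting HTK question patterns into regexes and
# applying them to a full-context label. The label string and question patterns
# below are made-up examples.
#
#     pattern = re.compile(wildcards2regex("*-sil+*"))
#     bool(pattern.search("x^x-sil+hh=iy@x_x/A:5_0_0"))   # -> True
#
#     cqs = re.compile(wildcards2regex("*/A:(\\d+)_*", convert_number_pattern=True))
#     cqs.search("x^x-sil+hh=iy@x_x/A:5_0_0").group(1)    # -> "5"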


def load_question_set(qs_file_name, append_hat_for_LL=True, convert_svs_pattern=True):
    """Load HTS-style question and convert it to binary/continuous feature
    extraction regexes.

    This code was taken from Merlin.

    Args:
        qs_file_name (str): Input HTS-style question file path
        append_hat_for_LL (bool): Append ^ for LL regex search. Note that the
            left-most context is assumed to be the phoneme identity before the
            previous phoneme (i.e. LL-xx). This parameter should be False for
            the HTS-demo_NIT-SONG070-F001 demo.
        convert_svs_pattern (bool): Convert SVS specific patterns.

    Returns:
        (binary_dict, numeric_dict): Binary/numeric feature extraction regexes.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_question_file
        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
    """
    with open(qs_file_name) as f:
        lines = f.readlines()

    binary_qs_index = 0
    continuous_qs_index = 0
    binary_dict = {}
    numeric_dict = {}

    LL = re.compile(re.escape("LL-"))

    for line in lines:
        line = line.replace("\n", "")
        temp_list = line.split()

        if len(line) <= 0 or line.startswith("#"):
            continue

        name = temp_list[1].replace('"', "").replace("'", "")

        temp_list = line.split("{")
        temp_line = temp_list[1]
        temp_list = temp_line.split("}")
        temp_line = temp_list[0]
        temp_line = temp_line.strip()
        question_list = temp_line.split(",")

        temp_list = line.split(" ")
        question_key = temp_list[1]
        if temp_list[0] == "CQS":
            assert len(question_list) == 1
            processed_question = wildcards2regex(
                question_list[0],
                convert_number_pattern=True,
                convert_svs_pattern=convert_svs_pattern,
            )
            numeric_dict[continuous_qs_index] = (
                name,
                re.compile(processed_question),
            )  # save pre-compiled regular expression
            continuous_qs_index = continuous_qs_index + 1
        elif temp_list[0] == "QS":
            re_list = []
            for temp_question in question_list:
                processed_question = wildcards2regex(temp_question)
                if (
                    append_hat_for_LL
                    and LL.search(question_key)
                    and processed_question[0] != "^"
                ):
                    processed_question = "^" + processed_question
                re_list.append(re.compile(processed_question))

            binary_dict[binary_qs_index] = (name, re_list)
            binary_qs_index = binary_qs_index + 1
        else:
            raise RuntimeError("Not supported question format")

    return binary_dict, numeric_dict
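

# Usage sketch (illustrative): computing binary answers for one label from the
# loaded question set. The question file path and the label string are
# assumptions.
#
#     binary_dict, numeric_dict = load_question_set("questions_qst001.hed")
#     label = "x^x-sil+hh=iy@x_x/A:0_0_0"
#     answers = [
#         1 if any(r.search(label) for r in regexes) else 0
#         for name, regexes in binary_dict.values()
#     ]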


def write_audacity_labels(dst_path, labels):
    """Write audacity labels from HTS-style labels

    Args:
        dst_path (str): The output file path.
        labels (HTSLabelFile): HTS style labels
    """
    with open(dst_path, "w") as of:
        for s, e, l in labels:
            s, e = s * 1e-7, e * 1e-7
            if "-" in l and "+" in l:
                ph = l.split("-")[1].split("+")[0]
            else:
                ph = l
            of.write("{:.4f}\t{:.4f}\t{}\n".format(s, e, ph))


def write_textgrid(dst_path, labels):
    """Write TextGrid from HTS-style labels

    Args:
        dst_path (str): The output file path.
        labels (HTSLabelFile): HTS style labels
    """
    template = """File type = "ooTextFile"
Object class = "TextGrid"
xmin = 0
xmax = {xmax}
tiers? <exists>
size = 1
item []:
    item [1]:
        class = "IntervalTier"
        name = "phoneme"
        xmin = 0
        xmax = {xmax}
        intervals: size = {size}"""

    template = template.format(xmax=labels.end_times[-1] * 1e-7, size=len(labels))

    for idx, (s, e, l) in enumerate(labels):
        s, e = s * 1e-7, e * 1e-7
        if "-" in l and "+" in l:
            ph = l.split("-")[1].split("+")[0]
        else:
            ph = l

        template += """
        intervals [{idx}]:
            xmin = {s}
            xmax = {e}
            text = "{ph}" """.format(
            idx=idx + 1, s=s, e=e, ph=ph
        )

    template += "\n"

    with open(dst_path, "w") as of:
        of.write(template)
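

# Usage sketch (illustrative): exporting an alignment for inspection in Audacity
# or Praat. The paths are hypothetical.
#
#     labels = load("phone_align.lab")
#     write_audacity_labels("phone_align.txt", labels)   # tab-separated, times in seconds
#     write_textgrid("phone_align.TextGrid", labels)     # Praat TextGrid with a phoneme tier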