# Part of code here is adapted from Merlin. Their license follows:
##########################################################################
# The Neural Network (NN) based Speech Synthesis System
# https://github.com/CSTR-Edinburgh/merlin
#
# Centre for Speech Technology Research
# University of Edinburgh, UK
# Copyright (c) 2014-2015
# All Rights Reserved.
#
# The system as a whole and most of the files in it are distributed
# under the following copyright and conditions
#
# Permission is hereby granted, free of charge, to use and distribute
# this software and its documentation without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this work, and to
# permit persons to whom this work is furnished to do so, subject to
# the following conditions:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# - The authors' names may not be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK
# DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
# SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE
# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
# THIS SOFTWARE.
##########################################################################
import re
from copy import copy
import numpy as np
class HTSLabelFile(object):
    """Memory representation for HTS-style context labels (a.k.a HTK alignment).

    Indexing is supported. It returns a tuple of
    (``start_time``, ``end_time``, ``label``).

    Attributes:
        start_times (list): Start times in 100ns units.
        end_times (list): End times in 100ns units.
        contexts (list): Contexts. Each value should have either phone or
            full-context annotation.
        frame_shift (int): Frame shift in 100ns units (used by :func:`round_`).

    Examples:
        Load from file

        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
        >>> print(labels[0])
        (0, 50000, 'x^x-sil+hh=iy@x_x/A:0_0_0/B:x-x-x@x-x&x-x#x-x$x-x!x-x;x-x|x\
/C:1+1+2/D:0_0/E:x+x@x+x&x+x#x+x/F:content_1/G:0_0/H:x=x@1=2|0/I:4=3/\
J:13+9-2[2]')

        Create memory representation of label

        >>> labels = hts.HTSLabelFile()
        >>> labels.append((0, 3125000, "silB"))
        0 3125000 silB
        >>> labels.append((3125000, 3525000, "m"))
        0 3125000 silB
        3125000 3525000 m
        >>> labels.append((3525000, 4325000, "i"))
        0 3125000 silB
        3125000 3525000 m
        3525000 4325000 i

        Save to file

        >>> from tempfile import TemporaryFile
        >>> with TemporaryFile("w") as f:
        ...     f.write(str(labels))
        50
    """

    def __init__(self, frame_shift=50000):
        self.start_times = []
        self.end_times = []
        self.contexts = []
        self.frame_shift = frame_shift

    @classmethod
    def create_from_contexts(cls, contexts):
        """Create an instance directly from a list of context strings."""
        return cls().load(None, contexts)

    def __len__(self):
        return len(self.start_times)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            # NOTE: slicing copies the backing lists; inefficient but simple.
            # The slice step is intentionally ignored (only start/stop used).
            current, stop, _ = idx.indices(len(self))
            obj = copy(self)
            obj.start_times = obj.start_times[current:stop]
            obj.end_times = obj.end_times[current:stop]
            obj.contexts = obj.contexts[current:stop]
            return obj
        elif isinstance(idx, list):
            # Fancy indexing with a list of indices, delegated to numpy.
            obj = copy(self)
            obj.start_times = list(np.asarray(obj.start_times)[idx])
            obj.end_times = list(np.asarray(obj.end_times)[idx])
            obj.contexts = list(np.asarray(obj.contexts)[idx])
            return obj
        else:
            return self.start_times[idx], self.end_times[idx], self.contexts[idx]

    def __str__(self):
        ret = ""
        if len(self.start_times) == 0:
            return ret
        for s, e, context in self:
            ret += "{} {} {}\n".format(s, e, context)
        # Drop the trailing newline.
        return ret[:-1]

    def __repr__(self):
        return str(self)

    def round_(self):
        """Round start/end times to multiples of ``frame_shift`` in place.

        NOTE: ``np.round`` rounds ties to the nearest even multiple.

        Returns:
            self
        """
        s = self.frame_shift
        self.start_times = list(
            np.round(np.asarray(self.start_times) / s).astype(np.int64) * s
        )
        self.end_times = list(
            np.round(np.asarray(self.end_times) / s).astype(np.int64) * s
        )
        return self

    def append(self, label, strict=True):
        """Append a single alignment label

        Args:
            label (tuple): tuple of (start_time, end_time, context).
            strict (bool): strict mode. If True, validates time monotonicity.

        Returns:
            self

        Raises:
            ValueError: if start_time >= end_time
            ValueError: if last end time doesn't match start_time
        """
        start_time, end_time, context = label
        start_time = int(start_time)
        end_time = int(end_time)
        if strict:
            if start_time >= end_time:
                raise ValueError(
                    "end_time ({}) must be larger than start_time ({}).".format(
                        end_time, start_time
                    )
                )
            # Labels must be contiguous: each segment starts where the
            # previous one ended.
            if len(self.end_times) > 0 and start_time != self.end_times[-1]:
                raise ValueError(
                    "start_time ({}) must be equal to the last end_time ({}).".format(
                        start_time, self.end_times[-1]
                    )
                )
        self.start_times.append(start_time)
        self.end_times.append(end_time)
        self.contexts.append(context)
        return self

    def set_durations(self, durations, frame_shift=50000):
        """Set start/end times from duration features

        The first start time is preserved as an offset; subsequent boundaries
        are the cumulative sum of ``durations * frame_shift``.

        Args:
            durations (ndarray): Duration (in frames) for each label, one
                duration per label (state-axis is flattened).
            frame_shift (int): Frame shift in 100ns units.

        Raises:
            RuntimeError: if the number of durations doesn't match the
                number of labels.

        TODO:
            this should be refactored
        """
        offset = self.start_times[0]
        # Unwrap state-axis: cumsum flattens the (N, 1) array.
        end_times = offset + np.cumsum(durations.reshape(-1, 1) * frame_shift).astype(
            np.int64
        )
        if len(end_times) != len(self.end_times):
            raise RuntimeError(
                "Unexpected number of durations: expected {}, got {}".format(
                    len(self.end_times), len(end_times)
                )
            )
        start_times = np.hstack((offset, end_times[:-1])).astype(np.int64)
        # Store as lists for consistency with the rest of the class.
        self.start_times, self.end_times = list(start_times), list(end_times)

    def load(self, path=None, lines=None):
        """Load labels from file

        Args:
            path (str): File path
            lines (list): Content of label file. If not None, construct
                HTSLabelFile directly from it instead of loading a file.

        Raises:
            ValueError: if the content of labels is empty.
            RuntimeError: if a line has an unsupported number of columns.
        """
        assert path is not None or lines is not None
        if lines is None:
            with open(path) as f:
                lines = f.readlines()
        else:
            if len(lines) == 0:
                raise ValueError(
                    "Empty label is specified! Please check if the input contains any content."
                )

        # Times may be given either in seconds (float, HTK/Julius-style) or
        # in 100ns units (int, HTS-style). Once a decimal point is seen, all
        # subsequent lines are treated as seconds.
        is_sec_format = False
        start_times = []
        end_times = []
        contexts = []
        for line in lines:
            line = line.strip()
            # Skip blank lines and comments (previously a blank line raised
            # IndexError on ``line[0]``).
            if not line or line.startswith("#"):
                continue
            cols = line.split()
            if len(cols) == 3:
                start_time, end_time, context = cols
                if "." in start_time or "." in end_time:
                    is_sec_format = True
                if is_sec_format:
                    # convert sec to 100ns (HTS format)
                    start_time = int(1e7 * float(start_time))
                    end_time = int(1e7 * float(end_time))
                else:
                    start_time = int(start_time)
                    end_time = int(end_time)
            elif len(cols) == 1:
                # Context-only line: no alignment information.
                start_time = -1
                end_time = -1
                context = cols[0]
            else:
                raise RuntimeError("Not supported for now")
            start_times.append(start_time)
            end_times.append(end_time)
            contexts.append(context)

        self.start_times = start_times
        self.end_times = end_times
        self.contexts = contexts
        return self

    def silence_label_indices(self, regex=None):
        """Returns silence label indices

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence label indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        return np.where(list(map(regex.match, self.contexts)))[0]

    def silence_phone_indices(self, regex=None):
        """Returns phone-level silence indices

        Args:
            regex (re(optional)): Compiled regex to find silence labels.

        Returns:
            1darray: Silence phone indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        return np.unique(self.silence_label_indices(regex) // self.num_states())

    def silence_frame_indices(self, regex=None, frame_shift=50000):
        """Returns silence frame indices

        Similar to :func:`silence_label_indices`, but returns indices in
        frame-level.

        Args:
            regex (re(optional)): Compiled regex to find silence labels.
            frame_shift (int): Frame shift in 100ns units.

        Returns:
            1darray: Silence frame indices
        """
        if regex is None:
            regex = re.compile(".*-sil+.*")
        indices = self.silence_label_indices(regex)
        if len(indices) == 0:
            # Keep the dtype consistent with the non-empty case.
            return np.empty(0, dtype=np.int64)
        start_times = np.array(self.start_times)
        end_times = np.array(self.end_times)
        s = start_times[indices] // frame_shift
        e = end_times[indices] // frame_shift
        return np.unique(
            np.concatenate([np.arange(a, b) for (a, b) in zip(s, e)], axis=0)
        ).astype(np.int64)

    def is_state_alignment_label(self):
        # State-alignment contexts end with a bracketed state index, e.g. "[2]".
        return self.contexts[0][-1] == "]" and self.contexts[0][-3] == "["

    def num_states(self):
        """Returns number of states excluding special begin/end states."""
        if not self.is_state_alignment_label():
            return 1
        assert len(self) > 0
        # NOTE: assumes state numbers are single digits (index [-2]).
        initial_state_num = int(self.contexts[0][-2])
        largest_state_num = initial_state_num
        for label in self.contexts[1:]:
            n = int(label[-2])
            if n > largest_state_num:
                largest_state_num = n
            else:
                # State numbers wrapped around: next phone started.
                break
        return largest_state_num - initial_state_num + 1

    def num_phones(self):
        """Returns number of phones (labels grouped by states if present)."""
        if self.is_state_alignment_label():
            return len(self) // self.num_states()
        else:
            return len(self)

    def num_frames(self, frame_shift=50000):
        """Returns total number of frames given the frame shift (100ns units)."""
        return self.end_times[-1] // frame_shift
def load(path=None, lines=None):
    """Load HTS-style label file

    Args:
        path (str): Path of file.
        lines (list): Content of label file. If not None, construct
            HTSLabelFile directly from it instead of loading a file.

    Returns:
        labels (HTSLabelFile): Instance of HTSLabelFile.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_label_file
        >>> labels = hts.load(example_label_file())
    """
    # Thin convenience wrapper around HTSLabelFile.load.
    return HTSLabelFile().load(path, lines)
def wildcards2regex(question, convert_number_pattern=False, convert_svs_pattern=True):
    r"""Convert an HTK-style question into a regular expression for searching labels.

    If ``convert_number_pattern`` is True, keep the following sequences
    unescaped so they can extract continuous values:

        (\d+) -- handles digit without decimal point
        ([\d\.]+) -- handles digits with and without decimal point
        ([-\d]+) -- handles positive and negative numbers

    Args:
        question (str): HTK-style question (may contain ``*`` wildcards).
        convert_number_pattern (bool): Keep numeric capture groups unescaped.
        convert_svs_pattern (bool): Also keep singing-voice-synthesis
            specific capture groups unescaped (only applies when
            ``convert_number_pattern`` is True).

    Returns:
        str: Regular expression pattern string.
    """
    # A "*" at either end of an HTK pattern means "match anywhere"; its
    # absence anchors the pattern to the start/end of the label.
    anchor_start = ""
    anchor_end = ""
    if "*" in question:
        if not question.startswith("*"):
            anchor_start = "\\A"
        if not question.endswith("*"):
            anchor_end = "\\Z"
        question = question.strip("*")
    # Escape regex metacharacters, then turn the (escaped) interior HTK
    # wildcard "*" into its regex equivalent ".*".
    pattern = anchor_start + re.escape(question).replace("\\*", ".*") + anchor_end
    if convert_number_pattern:
        # Un-escape the numeric capture groups listed in the docstring.
        pattern = pattern.replace("\\(\\\\d\\+\\)", "(\\d+)")
        pattern = pattern.replace("\\(\\[\\-\\\\d\\]\\+\\)", "([-\\d]+)")
        pattern = pattern.replace("\\(\\[\\\\d\\\\\\.\\]\\+\\)", "([\\d\\.]+)")
        # NOTE: singing voice synthesis specific handling
        if convert_svs_pattern:
            pattern = pattern.replace(
                "\\(\\[A\\-Z\\]\\[b\\]\\?\\[0\\-9\\]\\+\\)", "([A-Z][b]?[0-9]+)"
            )
            pattern = pattern.replace("\\(\\\\NOTE\\)", "([A-Z][b]?[0-9]+)")
            pattern = pattern.replace("\\(\\[pm\\]\\\\d\\+\\)", "([pm]\\d+)")
    return pattern
def load_question_set(qs_file_name, append_hat_for_LL=True, convert_svs_pattern=True):
    """Load HTS-style question and convert it to binary/continuous feature
    extraction regexes.

    This code was taken from Merlin.

    Args:
        qs_file_name (str): Input HTS-style question file path
        append_hat_for_LL (bool): Append ^ for LL regex search.
            Note that the most left context is assumed to be phoneme identity
            before the previous phoneme (i.e. LL-xx). This parameter should be
            False for the HTS-demo_NIT-SONG070-F001 demo.
        convert_svs_pattern (bool): Convert SVS specific patterns.

    Returns:
        (binary_dict, numeric_dict): Binary/numeric feature extraction
        regexes.

    Examples:
        >>> from nnmnkwii.io import hts
        >>> from nnmnkwii.util import example_question_file
        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
    """
    with open(qs_file_name) as f:
        lines = f.readlines()

    binary_dict = {}
    numeric_dict = {}
    binary_idx = 0
    numeric_idx = 0
    # Matches questions about the before-previous phoneme identity ("LL-xx").
    LL = re.compile(re.escape("LL-"))

    for raw_line in lines:
        line = raw_line.replace("\n", "")
        if len(line) <= 0 or line.startswith("#"):
            continue
        # A question line looks like: QS "name" {pattern1,pattern2,...}
        name = line.split()[1].replace('"', "").replace("'", "")
        patterns = line.split("{")[1].split("}")[0].strip().split(",")
        space_cols = line.split(" ")
        question_type = space_cols[0]
        question_key = space_cols[1]

        if question_type == "CQS":
            # Continuous question: exactly one numeric extraction pattern.
            assert len(patterns) == 1
            processed = wildcards2regex(
                patterns[0],
                convert_number_pattern=True,
                convert_svs_pattern=convert_svs_pattern,
            )
            # Save pre-compiled regular expression.
            numeric_dict[numeric_idx] = (name, re.compile(processed))
            numeric_idx += 1
        elif question_type == "QS":
            compiled = []
            for pattern in patterns:
                processed = wildcards2regex(pattern)
                # Anchor LL- questions at the start so they don't accidentally
                # match the L- (previous phoneme) context.
                if (
                    append_hat_for_LL
                    and LL.search(question_key)
                    and processed[0] != "^"
                ):
                    processed = "^" + processed
                compiled.append(re.compile(processed))
            binary_dict[binary_idx] = (name, compiled)
            binary_idx += 1
        else:
            raise RuntimeError("Not supported question format")
    return binary_dict, numeric_dict
def write_audacity_labels(dst_path, labels):
    """Write audacity labels from HTS-style labels

    Args:
        dst_path (str): The output file path.
        labels (HTSLabelFile): HTS style labels
    """
    with open(dst_path, "w") as of:
        for start_time, end_time, context in labels:
            # Times are in 100ns units; Audacity wants seconds.
            start_sec = start_time * 1e-7
            end_sec = end_time * 1e-7
            # For full-context labels, extract the central phoneme
            # (the part between "-" and "+"); otherwise use as-is.
            if "-" in context and "+" in context:
                phoneme = context.split("-")[1].split("+")[0]
            else:
                phoneme = context
            of.write("{:.4f}\t{:.4f}\t{}\n".format(start_sec, end_sec, phoneme))
def write_textgrid(dst_path, labels):
    """Write TextGrid from HTS-style labels

    Produces a Praat TextGrid with a single IntervalTier named "phoneme".

    Args:
        dst_path (str): The output file path.
        labels (HTSLabelFile): HTS style labels
    """
    # TextGrid header; {xmax}/{size} are filled in below.
    template = """File type = "ooTextFile"
Object class = "TextGrid"
xmin = 0
xmax = {xmax}
tiers? <exists>
size = 1
item []:
item [1]:
class = "IntervalTier"
name = "phoneme"
xmin = 0
xmax = {xmax}
intervals: size = {size}"""
    # Times are in 100ns units; TextGrid wants seconds.
    template = template.format(xmax=labels.end_times[-1] * 1e-7, size=len(labels))
    for idx, (s, e, l) in enumerate(labels):
        s, e = s * 1e-7, e * 1e-7
        # For full-context labels, extract the central phoneme
        # (the part between "-" and "+"); otherwise use as-is.
        if "-" in l and "+" in l:
            ph = l.split("-")[1].split("+")[0]
        else:
            ph = l
        # TextGrid intervals are 1-indexed; the trailing space before the
        # closing quotes avoids an ambiguous '""""' sequence in the literal.
        template += """
intervals [{idx}]:
xmin = {s}
xmax = {e}
text = "{ph}" """.format(
            idx=idx + 1, s=s, e=e, ph=ph
        )
    template += "\n"
    with open(dst_path, "w") as of:
        of.write(template)