# Acknowledgements:
# mask-related functions were adapted from https://github.com/espnet/espnet
import importlib
import random
from functools import partial
from pathlib import Path
from typing import Any
import numpy as np
import pkg_resources
import torch
# see COPYING for the license of the audio file.
# Package-relative paths to the bundled example data used by the
# example_*_file() helpers below.
EXAMPLE_AUDIO = "_example_data/BASIC5000_0001.wav"
EXAMPLE_LABEL = "_example_data/BASIC5000_0001.lab"
# monophone variant of the label above
EXAMPLE_MONO_LABEL = "_example_data/BASIC5000_0001_mono.lab"
# question set file (HTS-style .hed — presumably for linguistic feature
# extraction; confirm against callers)
EXAMPLE_QST = "_example_data/qst1.hed"
def init_seed(seed):
    """Seed every random number generator used by this package.

    Seeds Python's ``random`` module, NumPy, and PyTorch (including all
    CUDA devices when CUDA is available) for reproducibility.

    Args:
        seed (int): random seed
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
def dynamic_import(name: str) -> Any:
    """Import an attribute (typically a class) from a module by name.

    Args:
        name (str): specification of the form ``"module_name:class_name"``

    Returns:
        Any: the attribute looked up on the imported module
    """
    module_path, attr_name = name.split(":")
    module = importlib.import_module(module_path)
    return getattr(module, attr_name)
def make_pad_mask(lengths, maxlen=None):
    """Make a mask that is True at padding frames.

    For each sequence, positions at or beyond its length are marked True.

    Args:
        lengths (list): per-sequence lengths (a tensor is converted via
            ``tolist()``)
        maxlen (int, optional): maximum length. If None, use max value of
            lengths.

    Returns:
        torch.Tensor: boolean mask of shape ``(len(lengths), maxlen)``
    """
    if not isinstance(lengths, list):
        lengths = lengths.tolist()
    batch_size = len(lengths)
    if maxlen is None:
        maxlen = int(max(lengths))
    # Per-frame position indices, broadcast over the batch dimension.
    positions = torch.arange(0, maxlen, dtype=torch.int64)
    positions = positions.unsqueeze(0).expand(batch_size, maxlen)
    # Each row is compared against that sequence's length.
    limits = positions.new(lengths).unsqueeze(-1)
    return positions >= limits
def make_non_pad_mask(lengths, maxlen=None):
    """Make a mask that is True at non-padding frames.

    This is the logical complement of :func:`make_pad_mask`.

    Args:
        lengths (list): per-sequence lengths
        maxlen (int, optional): maximum length. If None, use max value of
            lengths.

    Returns:
        torch.Tensor: boolean mask of shape ``(len(lengths), maxlen)``
    """
    padding = make_pad_mask(lengths, maxlen)
    return ~padding
def example_audio_file() -> str:
    """Get the path to the included example audio file.

    The bundled example is ``BASIC5000_0001.wav`` (see COPYING for its
    license).

    Returns:
        str: path to the example audio file

    Examples
    --------
    >>> from scipy.io import wavfile
    >>> fs, x = wavfile.read(example_audio_file())
    """
    # NOTE(review): pkg_resources is deprecated in favor of
    # importlib.resources; kept to preserve runtime behavior and because
    # the rest of this module still uses it.
    return pkg_resources.resource_filename(__name__, EXAMPLE_AUDIO)
def example_label_file(mono=False):
    """Get the path to an included label file.

    Args:
        mono (bool, optional): If True, return the monophone label file
            instead of the full one. Default: False

    Returns:
        str: path to an example label file
    """
    resource = EXAMPLE_MONO_LABEL if mono else EXAMPLE_LABEL
    return pkg_resources.resource_filename(__name__, resource)
def example_qst_file():
    """Get the path to an included question set file.

    Returns:
        str: path to an example question file.
    """
    return pkg_resources.resource_filename(__name__, EXAMPLE_QST)
def pad_1d(x, max_len, constant_values=0):
    """Pad a 1d-array to ``max_len`` with a constant value.

    Args:
        x (np.ndarray): array to pad; ``len(x)`` must not exceed
            ``max_len``
        max_len (int): length of the padded array
        constant_values (int, optional): value to pad with. Default: 0

    Returns:
        np.ndarray: padded array of length ``max_len``
    """
    # Pad only at the end (right side) of the array.
    return np.pad(
        x,
        (0, max_len - len(x)),
        mode="constant",
        constant_values=constant_values,
    )
def pad_2d(x, max_len, constant_values=0):
    """Pad a 2d-array along its first axis to ``max_len`` frames.

    Args:
        x (np.ndarray): array of shape ``(T, D)`` to pad; ``T`` must not
            exceed ``max_len``
        max_len (int): number of frames after padding
        constant_values (int, optional): value to pad with. Default: 0

    Returns:
        np.ndarray: padded array of shape ``(max_len, D)``
    """
    # Pad frames at the end; the feature dimension is left untouched.
    return np.pad(
        x,
        [(0, max_len - len(x)), (0, 0)],
        mode="constant",
        constant_values=constant_values,
    )
def load_utt_list(utt_list):
    """Load a list of utterance ids from a text file.

    Surrounding whitespace is stripped from each line and blank lines
    are skipped.

    Args:
        utt_list (str): path to a file with one utterance id per line

    Returns:
        List[str]: list of utterance ids
    """
    with open(utt_list) as f:
        stripped = (line.strip() for line in f)
        return [utt_id for utt_id in stripped if utt_id]
def trim_silence(feats, labels, start_sec=0.05, end_sec=0.1, shift_sec=0.005):
    """Trim leading and trailing silence from input features.

    The first and last label segments must be silence ("sil"); frames
    are kept from ``start_sec`` before the first non-silence segment to
    ``end_sec`` after the last one.

    Args:
        feats (np.ndarray): input features (frames along the first axis)
        labels: label object exposing ``contexts``, ``start_times`` and
            ``end_times`` (times in 100 ns HTS units — assumes a 5 ms
            frame shift; TODO confirm against the feature extraction)
        start_sec (float, optional): margin kept before the first
            non-silence segment
        end_sec (float, optional): margin kept after the last
            non-silence segment
        shift_sec (float, optional): frame shift used to convert the
            margins into frame counts

    Returns:
        np.ndarray: trimmed features
    """
    assert "sil" in labels.contexts[0] and "sil" in labels.contexts[-1]
    # 50000 * 100 ns = 5 ms per frame.
    first = int(labels.start_times[1] / 50000) - int(start_sec / shift_sec)
    last = int(labels.end_times[-2] / 50000) + int(end_sec / shift_sec)
    first = max(0, first)
    last = min(len(feats), last)
    return feats[first:last]
def find_feats(directory, utt_id, typ="out_duration", ext="-feats.npy"):
    """Locate the feature file for an utterance under a directory tree.

    Args:
        directory (str or Path): root directory to search recursively
        utt_id (str): utterance id
        typ (str, optional): feature sub-directory name.
            Default: "out_duration"
        ext (str, optional): filename suffix. Default: "-feats.npy"

    Returns:
        Path: first match in sorted order (raises IndexError if none)
    """
    root = Path(directory) if isinstance(directory, str) else directory
    matches = sorted(root.rglob(f"**/{typ}/{utt_id}{ext}"))
    return matches[0]
def find_lab(directory, utt_id):
    """Locate the label file for an utterance under a directory tree.

    Args:
        directory (str or Path): root directory to search recursively
        utt_id (str): utterance id

    Returns:
        Path: path to the unique ``<utt_id>.lab`` file

    Raises:
        AssertionError: if the number of matches is not exactly one
    """
    root = Path(directory) if isinstance(directory, str) else directory
    matches = sorted(root.rglob(f"{utt_id}.lab"))
    assert len(matches) == 1
    return matches[0]
def lab2phonemes(labels):
    """Extract the phoneme sequence from a label object.

    For full-context labels the current phoneme is the segment between
    "-" and "+"; monophone contexts (no "-") are used as-is.

    Args:
        labels: label object exposing a ``contexts`` attribute (note:
            not a file path — the caller must load the label file first)

    Returns:
        List[str]: phoneme sequence
    """
    return [
        c.split("-")[1].split("+")[0] if "-" in c else c
        for c in labels.contexts
    ]
def optional_tqdm(tqdm_mode, **kwargs):
    """Return a progress-bar factory depending on ``tqdm_mode``.

    Args:
        tqdm_mode (str): "tqdm" for the console bar, "tqdm-notebook"
            for the notebook widget; any other value disables the bar.
        **kwargs: keyword arguments forwarded to tqdm

    Returns:
        callable: a tqdm constructor, or an identity function
    """
    if tqdm_mode == "tqdm":
        from tqdm import tqdm as progress
    elif tqdm_mode == "tqdm-notebook":
        from tqdm.notebook import tqdm as progress
    else:
        # No progress bar: pass the iterable through unchanged.
        return lambda x: x
    return partial(progress, **kwargs)
class StandardScaler:
    """``sklearn.preprocessing.StandardScaler``-like class providing only
    transform functionality.

    Args:
        mean (np.ndarray): per-feature mean
        var (np.ndarray): per-feature variance
        scale (np.ndarray): per-feature scale used for normalization
    """

    def __init__(self, mean, var, scale):
        self.mean_ = mean
        self.var_ = var
        # NOTE: scale may not be exactly the same as np.sqrt(var)
        self.scale_ = scale

    def transform(self, x):
        """Normalize ``x``: ``(x - mean) / scale``."""
        return (x - self.mean_) / self.scale_

    def inverse_transform(self, x):
        """Undo :meth:`transform`: ``x * scale + mean``."""
        return x * self.scale_ + self.mean_