import numpy as np
import pysptk
import pyworld
import torch
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.postfilters import merlin_post_filter
from ttslearn.dnntts.multistream import (
get_static_stream_sizes,
get_windows,
multi_stream_mlpg,
split_streams,
)
@torch.no_grad()
def predict_duration(
    device,  # torch.device (cpu or cuda)
    labels,  # full-context labels
    duration_model,  # trained duration model
    duration_config,  # duration model configuration
    duration_in_scaler,  # StandardScaler for linguistic (input) features
    duration_out_scaler,  # StandardScaler for phoneme durations (output)
    binary_dict,  # regexes for extracting binary features
    numeric_dict,  # regexes for extracting numeric features
):
    """Predict phoneme durations from full-context labels.

    Args:
        device (torch.device): pytorch device
        labels (list): list of full-context labels
        duration_model (nn.Module): trained duration model
        duration_config (dict): configuration of duration model
            (not used in this function; kept for API symmetry with
            ``predict_acoustic``)
        duration_in_scaler (sklearn.preprocessing.StandardScaler):
            StandardScaler of duration features
        duration_out_scaler (sklearn.preprocessing.StandardScaler):
            StandardScaler of duration output
        binary_dict (dict): dictionary of regexes for binary features
        numeric_dict (dict): dictionary of regexes for numeric features

    Returns:
        numpy.ndarray: predicted phoneme durations, rounded to integers
        and clamped to be at least 1
    """
    # Extract linguistic features (one row per phoneme)
    in_feats = fe.linguistic_features(labels, binary_dict, numeric_dict).astype(
        np.float32
    )
    # Normalize the linguistic features
    in_feats = duration_in_scaler.transform(in_feats)
    # Predict durations; the model expects a (batch, time, feat) tensor
    # plus a list of sequence lengths
    x = torch.from_numpy(in_feats).float().to(device).view(1, -1, in_feats.shape[-1])
    # .cpu().numpy() instead of the legacy .data.numpy(); gradients are
    # already disabled by @torch.no_grad()
    pred_durations = duration_model(x, [x.shape[1]]).squeeze(0).cpu().numpy()
    # Undo the output normalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    # Clamp non-positive predictions to one frame, then round to integers
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)
    return pred_durations
@torch.no_grad()
def predict_acoustic(
    device,  # torch.device (cpu or cuda)
    labels,  # full-context labels
    acoustic_model,  # trained acoustic model
    acoustic_config,  # acoustic model configuration
    acoustic_in_scaler,  # StandardScaler for linguistic (input) features
    acoustic_out_scaler,  # StandardScaler for acoustic (output) features
    binary_dict,  # regexes for extracting binary features
    numeric_dict,  # regexes for extracting numeric features
    mlpg=True,  # whether to apply MLPG parameter generation
):
    """Predict acoustic features from full-context labels.

    Args:
        device (torch.device): pytorch device
        labels (list): list of full-context labels
        acoustic_model (nn.Module): trained acoustic model
        acoustic_config (dict): configuration of acoustic model; must
            provide ``has_dynamic_features``, ``num_windows`` and
            ``stream_sizes`` attributes
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler):
            StandardScaler of acoustic features
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler):
            StandardScaler of acoustic output
        binary_dict (dict): dictionary of regexes for binary features
        numeric_dict (dict): dictionary of regexes for numeric features
        mlpg (bool): whether to use MLPG

    Returns:
        numpy.ndarray: predicted acoustic features; static features only
        (shape ``(T, static_dim)``) when MLPG is applied, otherwise the
        full static+dynamic feature matrix ``(T, D_out)``
    """
    # Extract frame-level linguistic features (duration information is
    # encoded via frame features and coarse-coded subphone features)
    in_feats = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )
    # Normalize the linguistic features
    in_feats = acoustic_in_scaler.transform(in_feats)
    # Predict acoustic features; the model expects a (batch, time, feat)
    # tensor plus a list of sequence lengths
    x = torch.from_numpy(in_feats).float().to(device).view(1, -1, in_feats.shape[-1])
    # .cpu().numpy() instead of the legacy .data.numpy(); gradients are
    # already disabled by @torch.no_grad()
    pred_acoustic = acoustic_model(x, [x.shape[1]]).squeeze(0).cpu().numpy()
    # Undo the output normalization
    pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
    # Run maximum likelihood parameter generation (MLPG) to obtain smooth
    # static-feature trajectories from static+dynamic predictions
    if mlpg and np.any(acoustic_config.has_dynamic_features):
        # (T, D_out) -> (T, static_dim)
        pred_acoustic = multi_stream_mlpg(
            pred_acoustic,
            acoustic_out_scaler.var_,  # per-dimension variances used as MLPG weights
            get_windows(acoustic_config.num_windows),
            acoustic_config.stream_sizes,
            acoustic_config.has_dynamic_features,
        )
    return pred_acoustic