import librosa
import numpy as np
import pysptk
import pyworld
from nnmnkwii.preprocessing import delta_features
from nnmnkwii.preprocessing.f0 import interp1d
def f0_to_lf0(f0):
    """Convert F0 to log-F0.

    Non-zero entries are replaced by their natural logarithm; entries that
    are exactly zero are kept at zero. The input array is left untouched.

    Args:
        f0 (ndarray): F0 in Hz.

    Returns:
        ndarray: log-F0, a copy with the same shape as ``f0``.
    """
    # Only non-zero frames get a logarithm; log(0) is never evaluated.
    voiced = f0 != 0
    log_f0 = f0.copy()
    log_f0[voiced] = np.log(f0[voiced])
    return log_f0
def lf0_to_f0(lf0, vuv):
    """Convert log-F0 (and V/UV) to F0.

    Args:
        lf0 (ndarray): log-F0.
        vuv (ndarray): V/UV flags; values below 0.5 mark a frame whose F0 is
            forced to zero.

    Returns:
        ndarray: F0 in Hz, with zeros wherever ``vuv < 0.5``.
    """
    unvoiced = vuv < 0.5
    hz = np.exp(lf0)
    # Zero out the frames flagged as unvoiced.
    hz[unvoiced] = 0
    return hz
def compute_delta(x, coef):
    """Compute delta features.

    Each column of ``x`` is cross-correlated with ``coef`` (``mode="same"``),
    so the output keeps the shape of ``x``.

    Args:
        x (ndarray): Feature matrix; columns are indexed along axis 1.
        coef (ndarray): Window coefficients.

    Returns:
        ndarray: Delta features, same shape and dtype as ``x``.
    """
    deltas = np.zeros_like(x)
    # Compute the dynamic feature independently for every feature dimension.
    for dim, column in enumerate(x.T):
        deltas[:, dim] = np.correlate(column, coef, mode="same")
    return deltas
def world_log_f0_vuv(x, sr):
    """WORLD-based log-F0 and V/UV extraction.

    F0 is estimated with DIO and refined with StoneMask. The log-F0 contour
    is interpolated to be continuous, extended with delta and delta-delta
    features, and stacked with the voiced/unvoiced flag.

    Args:
        x (numpy.ndarray): Waveform. Converted to a C-contiguous float64
            array internally, because pyworld rejects other dtypes.
        sr (int): Sampling rate.

    Returns:
        ndarray: float32 feature matrix ``[lf0 (+ deltas), vuv]``, one row
        per frame.
    """
    # pyworld's analysis functions only accept C-contiguous float64 buffers;
    # this is a no-op (no copy) when the input already satisfies that.
    x = np.ascontiguousarray(x, dtype=np.float64)

    f0, timeaxis = pyworld.dio(x, sr)
    # (Optional) refine the F0 estimate with StoneMask
    f0 = pyworld.stonemask(x, f0, timeaxis, sr)

    # Voiced/unvoiced flag: 1.0 where F0 is positive, 0.0 otherwise
    vuv = (f0 > 0).astype(np.float32)

    # Continuous log-F0
    lf0 = f0_to_lf0(f0)
    lf0 = interp1d(lf0)

    # Make the continuous log-F0 and the V/UV flag 2-D matrices
    lf0 = lf0[:, np.newaxis] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, np.newaxis] if len(vuv.shape) == 1 else vuv

    # Dynamic features
    windows = [
        [1.0],  # window for static features
        [-0.5, 0.0, 0.5],  # window for first-order dynamic features
        [1.0, -2.0, 1.0],  # window for second-order dynamic features
    ]
    lf0 = delta_features(lf0, windows)

    # Concatenate all features
    feats = np.hstack([lf0, vuv]).astype(np.float32)
    return feats
def world_spss_params(x, sr, mgc_order=None):
    """WORLD-based acoustic feature extraction.

    Extracts F0 (DIO + StoneMask), spectral envelope (CheapTrick) and
    aperiodicity (D4C), converts them to mel-cepstrum, continuous log-F0,
    V/UV flag and band aperiodicity, and appends delta / delta-delta
    features to everything except the V/UV flag.

    Args:
        x (ndarray): Waveform. Converted to a C-contiguous float64 array
            internally, because pyworld rejects other dtypes.
        sr (int): Sampling rate.
        mgc_order (int, optional): MGC order. Defaults to None, in which case
            it is derived from ``sr`` (39 at 16 kHz, capped at 59).

    Returns:
        ndarray: float32 WORLD feature matrix
        ``[mgc (+ deltas), lf0 (+ deltas), vuv, bap (+ deltas)]``, one row
        per frame.
    """
    # pyworld's analysis functions only accept C-contiguous float64 buffers;
    # this is a no-op (no copy) when the input already satisfies that.
    x = np.ascontiguousarray(x, dtype=np.float64)

    f0, timeaxis = pyworld.dio(x, sr)
    # (Optional) refine the F0 estimate with StoneMask
    f0 = pyworld.stonemask(x, f0, timeaxis, sr)

    sp = pyworld.cheaptrick(x, f0, timeaxis, sr)
    ap = pyworld.d4c(x, f0, timeaxis, sr)

    alpha = pysptk.util.mcepalpha(sr)
    # Mel-cepstrum dimensionality: following prior work, it is chosen so that
    # the number of dimensions is 40 (mgc_order + 1) at 16 kHz,
    # with an upper limit of 60 (59 + 1).
    # [Zen 2013] Statistical parametric speech synthesis using deep neural networks
    if mgc_order is None:
        mgc_order = min(int(sr / 16000.0 * 40) - 1, 59)
    mgc = pysptk.sp2mc(sp, mgc_order, alpha)

    # Voiced/unvoiced flag
    vuv = (f0 > 0).astype(np.float32)

    # Continuous log-F0
    lf0 = f0_to_lf0(f0)
    lf0 = interp1d(lf0)

    # Band aperiodicity
    bap = pyworld.code_aperiodicity(ap, sr)

    # Make F0 and V/UV 2-D matrices
    lf0 = lf0[:, np.newaxis] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, np.newaxis] if len(vuv.shape) == 1 else vuv

    # Dynamic features
    windows = [
        [1.0],  # window for static features
        [-0.5, 0.0, 0.5],  # window for first-order dynamic features
        [1.0, -2.0, 1.0],  # window for second-order dynamic features
    ]
    mgc = delta_features(mgc, windows)
    lf0 = delta_features(lf0, windows)
    bap = delta_features(bap, windows)

    feats = np.hstack([mgc, lf0, vuv, bap]).astype(np.float32)
    return feats
def mulaw(x, mu=255):
    """Mu-Law companding.

    Computes ``sign(x) * log(1 + mu * |x|) / log(1 + mu)``.

    Args:
        x (ndarray): Input signal.
        mu (int): Mu.

    Returns:
        ndarray: Compressed signal.
    """
    magnitude = np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.sign(x) * magnitude
def quantize(y, mu=255, offset=1):
    """Quantize the signal.

    The input is shifted by ``offset``, halved, scaled by ``mu`` and
    truncated to ``int64``.

    Args:
        y (ndarray): Input signal.
        mu (int): Mu.
        offset (int): Offset.

    Returns:
        ndarray: Quantized signal (dtype ``int64``).
    """
    # With the default offset: [-1, 1] -> [0, 2] -> [0, 1] -> [0, mu]
    shifted = y + offset
    scaled = shifted / 2 * mu
    return scaled.astype(np.int64)
def mulaw_quantize(x, mu=255):
    """Mu-law-quantize signal.

    Applies mu-law companding followed by quantization.

    Args:
        x (ndarray): Input signal.
        mu (int): Mu.

    Returns:
        ndarray: Quantized signal.
    """
    compressed = mulaw(x, mu)
    return quantize(compressed, mu)
def inv_mulaw(y, mu=255):
    """Inverse transformation of mu-law companding.

    Computes ``sign(y) * (1 / mu) * ((1 + mu) ** |y| - 1)``.

    Args:
        y (ndarray): Input signal.
        mu (int): Mu.

    Returns:
        ndarray: Uncompressed signal.
    """
    signed_scale = np.sign(y) * (1.0 / mu)
    expanded = (1.0 + mu) ** np.abs(y) - 1.0
    return signed_scale * expanded
def inv_quantize(y, mu=255):
    """De-quantization.

    Maps ``y`` linearly so that 0 becomes -1 and ``mu`` becomes 1.

    Args:
        y (ndarray): Input signal. Any array-like is accepted.
        mu (int): Mu. Defaults to 255, matching the other mu-law helpers in
            this module.

    Returns:
        ndarray: Unquantized signal (dtype ``float32``).
    """
    # np.asarray lets plain lists / scalars through without changing the
    # result for ndarray inputs.
    levels = np.asarray(y).astype(np.float32)
    # [0, mu] -> [-1, 1]
    return 2 * levels / mu - 1
def inv_mulaw_quantize(y, mu=255):
    """Inverse transformation of mu-law quantization.

    De-quantizes ``y`` and then undoes the mu-law companding.

    Args:
        y (ndarray): Input signal.
        mu (int): Mu.

    Returns:
        ndarray: Unquantized signal.
    """
    companded = inv_quantize(y, mu)
    return inv_mulaw(companded, mu)
def logspectrogram(
    y,
    sr,
    n_fft=None,
    hop_length=None,
    win_length=None,
    clip=0.001,
):
    """Compute log-spectrogram.

    Args:
        y (ndarray): Waveform.
        sr (int): Sampling rate.
        n_fft (int, optional): FFT size. Defaults to the smallest power of
            two that is at least ``win_length``.
        hop_length (int, optional): Hop length. Defaults to 12.5ms.
        win_length (int, optional): Window length. Defaults to 50 ms.
        clip (float, optional): Clip the magnitude. Defaults to 0.001.

    Returns:
        numpy.ndarray: Log-spectrogram (base-10 log of the clipped
        magnitude), time axis first.
    """
    if hop_length is None:
        hop_length = int(sr * 0.0125)
    if win_length is None:
        win_length = int(sr * 0.050)
    if n_fft is None:
        n_fft = next_power_of_2(win_length)

    # "hann" is the canonical SciPy window name; the legacy "hanning" alias
    # is not accepted by recent SciPy releases.
    S = librosa.stft(
        y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window="hann"
    )

    # Clip the spectrogram.
    # NOTE: the clipping value needs to be tuned depending on the data.
    # The Tacotron 2 paper uses 0.01.
    S = np.maximum(np.abs(S), clip)

    # Take the logarithm
    S = np.log10(S)

    # Time first: (T, N)
    return S.T
def next_power_of_2(x):
    """Return the smallest power of two that is >= ``x`` (1 for ``x == 0``).

    Args:
        x (int): Value to round up.

    Returns:
        int: Power of two.
    """
    if x == 0:
        return 1
    return 1 << (x - 1).bit_length()
def logmelspectrogram(
    y,
    sr,
    n_fft=None,
    hop_length=None,
    win_length=None,
    n_mels=80,
    fmin=None,
    fmax=None,
    clip=0.001,
):
    """Compute log-melspectrogram.

    Args:
        y (ndarray): Waveform.
        sr (int): Sampling rate.
        n_fft (int, optional): FFT size. Defaults to the smallest power of
            two that is at least ``win_length``.
        hop_length (int, optional): Hop length. Defaults to 12.5ms.
        win_length (int, optional): Window length. Defaults to 50 ms.
        n_mels (int, optional): Number of mel bins. Defaults to 80.
        fmin (int, optional): Minimum frequency. Defaults to 0.
        fmax (int, optional): Maximum frequency. Defaults to sr / 2.
        clip (float, optional): Clip the magnitude. Defaults to 0.001.

    Returns:
        numpy.ndarray: Log-melspectrogram (base-10 log of the clipped mel
        magnitudes), time axis first.
    """
    if hop_length is None:
        hop_length = int(sr * 0.0125)
    if win_length is None:
        win_length = int(sr * 0.050)
    if n_fft is None:
        n_fft = next_power_of_2(win_length)

    # "hann" is the canonical SciPy window name; the legacy "hanning" alias
    # is not accepted by recent SciPy releases.
    S = librosa.stft(
        y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window="hann"
    )

    fmin = 0 if fmin is None else fmin
    fmax = sr // 2 if fmax is None else fmax

    # Mel filterbank. sr / n_fft are passed by keyword: recent librosa
    # releases make them keyword-only, and older ones accept keywords too.
    mel_basis = librosa.filters.mel(
        sr=sr, n_fft=n_fft, fmin=fmin, fmax=fmax, n_mels=n_mels
    )
    # Spectrogram -> mel-spectrogram
    S = np.dot(mel_basis, np.abs(S))

    # Clipping
    S = np.maximum(S, clip)

    # Take the logarithm
    S = np.log10(S)

    # Time first: (T, N)
    return S.T
def logmelspectrogram_to_audio(
    logmel,
    sr,
    n_fft=None,
    hop_length=None,
    win_length=None,
    fmin=None,
    fmax=None,
    n_iter=4,
):
    """Log-melspectrogram to audio.

    The base-10 log is undone, the mel-spectrogram is mapped back to a
    linear-frequency magnitude spectrogram, and the waveform is recovered
    with the Griffin-Lim algorithm.

    Args:
        logmel (ndarray): Log-melspectrogram (base-10 log), time axis first.
        sr (int): Sampling rate.
        n_fft (int, optional): FFT size. Defaults to the smallest power of
            two that is at least ``win_length``.
        hop_length (int, optional): Hop length. Defaults to 12.5ms.
        win_length (int, optional): Window length. Defaults to 50 ms.
        fmin (int, optional): Minimum frequency. Defaults to 0.
        fmax (int, optional): Maximum frequency. Defaults to sr / 2.
        n_iter (int, optional): Number of Griffin-Lim iterations.
            Defaults to 4.

    Returns:
        numpy.ndarray: Waveform.
    """
    if hop_length is None:
        hop_length = int(sr * 0.0125)
    if win_length is None:
        win_length = int(sr * 0.050)
    if n_fft is None:
        n_fft = next_power_of_2(win_length)

    fmin = 0 if fmin is None else fmin
    fmax = sr // 2 if fmax is None else fmax

    # Undo log10 (10 ** logmel) and go back to frequency-first layout.
    mel = np.exp(logmel * np.log(10)).T
    S = librosa.feature.inverse.mel_to_stft(
        mel,
        n_fft=n_fft,
        power=1.0,
        sr=sr,
        fmin=fmin,
        fmax=fmax,
    )
    # "hann" is the canonical SciPy window name; the legacy "hanning" alias
    # is not accepted by recent SciPy releases.
    y = librosa.griffinlim(
        S, hop_length=hop_length, win_length=win_length, window="hann", n_iter=n_iter
    )

    return y