from __future__ import absolute_import, print_function, with_statement
import math
import numpy as np
# Scale factor (10 / ln 10) * sqrt(2) that ``melcd`` multiplies with the mean
# Euclidean cepstral distance to express the distortion in dB.
_logdb_const = 10.0 / np.log(10.0) * np.sqrt(2.0)
# should work on torch and numpy arrays
def _sqrt(x):
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return np.sqrt(x) if isnumpy else math.sqrt(x) if isscalar else x.sqrt()
def _exp(x):
isnumpy = isinstance(x, np.ndarray)
isscalar = np.isscalar(x)
return np.exp(x) if isnumpy else math.exp(x) if isscalar else x.exp()
def _sum(x):
if isinstance(x, list) or isinstance(x, np.ndarray):
return np.sum(x)
return float(x.sum())
def melcd(X, Y, lengths=None):
    """Mel-cepstrum distortion (MCD).

    The function computes MCD for time-aligned mel-cepstrum sequences,
    following Eq. (1a) of
    https://www.cs.cmu.edu/~awb/papers/sltu2008/kominek_black.sltu_2008.pdf:
    the Euclidean distance between ``X`` and ``Y`` is taken along the last
    (feature) axis, averaged over frames and scaled to dB.

    Args:
        X (ndarray): Input mel-cepstrum, shape can be either of
            (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch
            arrays are supported.
        Y (ndarray): Target mel-cepstrum, shape can be either of
            (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch
            arrays are supported.
        lengths (list): Lengths of padded inputs. This should only be
            specified if you give mini-batch inputs. When given, only the
            first ``lengths[i]`` frames of the ``i``-th sequence contribute,
            and 2-D inputs are treated as ``B x T`` batches of 1-dim
            features.

    Returns:
        float: Mean mel-cepstrum distortion in dB.

    .. note::
        The function doesn't check if inputs are actually mel-cepstrum.
    """
    if lengths is None:
        z = X - Y
        # Sum against the feature axis, then take the mean over the rest.
        r = _sqrt((z * z).sum(-1))
        # When ``_sqrt`` already produced a plain scalar there is nothing
        # left to average.
        if not np.isscalar(r):
            r = r.mean()
        return _logdb_const * float(r)

    # Case for 1-dim features: add the missing feature axis.
    if len(X.shape) == 2:
        X, Y = X[:, :, None], Y[:, :, None]

    s = 0.0
    T = _sum(lengths)
    for x, y, length in zip(X, Y, lengths):
        # Drop the padded tail of each sequence before measuring distance.
        x, y = x[:length], y[:length]
        z = x - y
        s += _sqrt((z * z).sum(-1)).sum()

    # Per-frame distances are averaged over the total number of valid frames.
    return _logdb_const * float(s) / float(T)
def mean_squared_error(X, Y, lengths=None):
    """Root of the mean squared error between ``X`` and ``Y``.

    Despite the function name, the returned value is the square root of
    the mean of the element-wise squared differences.

    Args:
        X (ndarray): Input features, shape can be either of
            (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch
            arrays are supported.
        Y (ndarray): Target features, shape can be either of
            (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch
            arrays are supported.
        lengths (list): Lengths of padded inputs. This should only be
            specified if you give mini-batch inputs. When given, only the
            first ``lengths[i]`` frames of the ``i``-th sequence contribute.
            2-D inputs are then treated as ``B x T`` batches.

    Returns:
        float: Square root of the mean squared error.

    Raises:
        ZeroDivisionError: If ``lengths`` is given and no element remains
            after trimming.

    .. tip::
        The function supports 3D padded inputs, while
        :func:`sklearn.metrics.mean_squared_error` doesn't support.
    """
    if lengths is None:
        z = X - Y
        return math.sqrt(float((z * z).mean()))

    sq_sum = 0.0
    n_elements = 0
    for x, y, length in zip(X, Y, lengths):
        z = x[:length] - y[:length]
        sq_sum += (z * z).sum()
        # Count the elements that were actually summed, so the denominator
        # is right for both ``B x T x D`` and ``B x T`` batches (for the
        # latter ``X.shape[-1]`` is the padded length, not a feature size).
        n_elements += int(np.prod(z.shape))

    return math.sqrt(float(sq_sum) / float(n_elements))
def lf0_mean_squared_error(
    src_f0, src_vuv, tgt_f0, tgt_vuv, lengths=None, linear_domain=False
):
    """Root of the mean squared error for log-F0 sequences.

    Only frames that are flagged as voiced in both ``src_vuv`` and
    ``tgt_vuv`` are compared. Despite the function name, the returned value
    is the square root of the mean squared difference.

    Args:
        src_f0 (ndarray): Input log-F0 sequences, shape can be either of
            (``T``,), (``B x T``) or (``B x T x 1``). Both Numpy and torch
            arrays are supported.
        src_vuv (ndarray): Input voiced/unvoiced flag array, shape can be
            either of (``T``, ), (``B x T``) or (``B x T x 1``). Boolean and
            0/1 numeric flags are both accepted.
        tgt_f0 (ndarray): Target log-F0 sequences, shape can be either of
            (``T``,), (``B x T``) or (``B x T x 1``). Both Numpy and torch
            arrays are supported.
        tgt_vuv (ndarray): Target voiced/unvoiced flag array, shape can be
            either of (``T``, ), (``B x T``) or (``B x T x 1``). Boolean and
            0/1 numeric flags are both accepted.
        lengths (list): Lengths of padded inputs. This should only be
            specified if you give mini-batch inputs.
        linear_domain (bool): Whether the error is computed on the linear
            frequency domain (values are exponentiated first) or on the
            log-frequency domain.

    Returns:
        float: Square root of the mean squared error over voiced frames.

    Raises:
        ZeroDivisionError: If ``lengths`` is given and no frame is voiced in
            both the source and the target.
    """
    if lengths is None:
        voiced_indices = _both_voiced(src_vuv, tgt_vuv)
        x = src_f0[voiced_indices]
        y = tgt_f0[voiced_indices]
        if linear_domain:
            x, y = _exp(x), _exp(y)
        return mean_squared_error(x, y)

    T = 0
    s = 0.0
    for x, x_vuv, y, y_vuv, length in zip(src_f0, src_vuv, tgt_f0, tgt_vuv, lengths):
        # Drop the padded tail of every sequence before masking.
        x, x_vuv = x[:length], x_vuv[:length]
        y, y_vuv = y[:length], y_vuv[:length]
        voiced_indices = _both_voiced(x_vuv, y_vuv)
        # The denominator is the number of frames voiced on both sides.
        T += voiced_indices.sum()
        x, y = x[voiced_indices], y[voiced_indices]
        if linear_domain:
            x, y = _exp(x), _exp(y)
        z = x - y
        s += (z * z).sum()

    return math.sqrt(float(s) / float(T))


def _both_voiced(a_vuv, b_vuv):
    """Return a boolean mask that is true where both flag arrays sum to >= 2."""
    # ``* 1`` promotes boolean flags to integers first: adding two NumPy
    # boolean arrays is a logical OR, whose result can never reach 2.
    return (a_vuv * 1 + b_vuv * 1) >= 2
def vuv_error(src_vuv, tgt_vuv, lengths=None):
    """Voiced/unvoiced error rate computation.

    The rate is the number of positions where ``src_vuv`` and ``tgt_vuv``
    differ, divided by the number of positions considered.

    Args:
        src_vuv (ndarray): Input voiced/unvoiced flag array, shape can be
            either of (``T``, ), (``B x T``) or (``B x T x 1``).
        tgt_vuv (ndarray): Target voiced/unvoiced flag array, shape can be
            either of (``T``, ), (``B x T``) or (``B x T x 1``).
        lengths (list): Lengths of padded inputs. This should only be
            specified if you give mini-batch inputs. When given, only the
            first ``lengths[i]`` frames of the ``i``-th sequence are
            compared and the denominator is the sum of ``lengths``.

    Returns:
        float: voiced/unvoiced error rate (0 ~ 1).
    """
    if lengths is None:
        # Every element of the array counts as one decision.
        n_total = np.prod(src_vuv.shape)
        n_wrong = (src_vuv != tgt_vuv).sum()
        return float(n_wrong) / float(n_total)

    n_total = _sum(lengths)
    n_wrong = 0.0
    for x, y, length in zip(src_vuv, tgt_vuv, lengths):
        # Ignore the padded tail of each sequence.
        n_wrong += (x[:length] != y[:length]).sum()

    return float(n_wrong) / float(n_total)