# Source code for nnmnkwii.metrics

from __future__ import absolute_import, print_function, with_statement

import math

import numpy as np

# Scale factor (10 / ln 10) * sqrt(2) by which ``melcd`` multiplies the mean
# per-frame Euclidean distance to express mel-cepstrum distortion in dB.
_logdb_const = 10.0 / np.log(10.0) * np.sqrt(2.0)


# should work on torch and numpy arrays
def _sqrt(x):
    """Square root that dispatches on the kind of ``x``.

    NumPy arrays go through ``np.sqrt``, scalars through ``math.sqrt``, and
    anything else is expected to provide its own ``sqrt()`` method.
    """
    if isinstance(x, np.ndarray):
        return np.sqrt(x)
    if np.isscalar(x):
        return math.sqrt(x)
    # Neither an ndarray nor a scalar: delegate to the object's own method.
    return x.sqrt()


def _exp(x):
    """Exponential that dispatches on the kind of ``x``.

    NumPy arrays go through ``np.exp``, scalars through ``math.exp``, and
    anything else is expected to provide its own ``exp()`` method.
    """
    if isinstance(x, np.ndarray):
        return np.exp(x)
    if np.isscalar(x):
        return math.exp(x)
    # Neither an ndarray nor a scalar: delegate to the object's own method.
    return x.exp()


def _sum(x):
    """Sum all elements of ``x``.

    Lists, tuples and NumPy arrays are summed with ``np.sum``. Any other
    object is expected to provide a ``sum()`` method whose result is
    converted to ``float``.
    """
    # Tuples are accepted alongside lists so that ``lengths=(3, 5)`` works;
    # a tuple has no ``.sum()`` method and would otherwise fail below.
    if isinstance(x, (list, tuple, np.ndarray)):
        return np.sum(x)
    return float(x.sum())


def melcd(X, Y, lengths=None):
    """Mel-cepstrum distortion (MCD).

    The function computes MCD for time-aligned mel-cepstrum sequences.

    Args:
        X (ndarray): Input mel-cepstrum, shape can be either of
          (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays
          are supported.
        Y (ndarray): Target mel-cepstrum, shape can be either of
          (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays
          are supported.
        lengths (list): Lengths of padded inputs. This should only be specified
          if you give mini-batch inputs. When given together with 2-D inputs,
          the inputs are treated as (``B x T``) batches of 1-dim features.

    Returns:
        float: Mean mel-cepstrum distortion in dB. NaN is returned if
        ``lengths`` is given and sums to zero (no frames to average over).

    .. note::

        The function doesn't check if inputs are actually mel-cepstrum.
    """
    # Per-frame Euclidean distance (sum over the feature axis), then the mean
    # over frames, scaled to dB. Eq. (1a) of
    # https://www.cs.cmu.edu/~awb/papers/sltu2008/kominek_black.sltu_2008.pdf
    if lengths is None:
        z = X - Y
        r = _sqrt((z * z).sum(-1))
        # A single frame yields a plain scalar; otherwise average over frames.
        if not np.isscalar(r):
            r = r.mean()
        return _logdb_const * float(r)

    # Case for 1-dim features.
    if len(X.shape) == 2:
        # Add feature axis
        X, Y = X[:, :, None], Y[:, :, None]

    T = _sum(lengths)
    if T == 0:
        # No valid frames: the mean is undefined, so return NaN rather than
        # raising ZeroDivisionError.
        return float("nan")

    s = 0.0
    for x, y, length in zip(X, Y, lengths):
        # Drop the padded frames of each utterance before measuring.
        x, y = x[:length], y[:length]
        z = x - y
        s += _sqrt((z * z).sum(-1)).sum()

    return _logdb_const * float(s) / float(T)


def mean_squared_error(X, Y, lengths=None):
    """Root of the mean squared error between two feature arrays.

    Despite its name, the function returns the *square root* of the mean
    squared error (i.e. RMSE). The name and the returned value are kept as
    they are for backward compatibility.

    Args:
        X (ndarray): Input features, shape can be either of
          (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays
          are supported.
        Y (ndarray): Target features, shape can be either of
          (``D``,), (``T x D``) or (``B x T x D``). Both Numpy and torch arrays
          are supported.
        lengths (list): Lengths of padded inputs. This should only be specified
          if you give mini-batch inputs. Padded (``B x T``) batches of 1-dim
          features are handled as well.

    Returns:
        float: Square root of the mean squared error over all non-padded
        elements. NaN is returned if ``lengths`` selects no elements.

    .. tip::

        The function supports 3D padded inputs, while
        :func:`sklearn.metrics.mean_squared_error` doesn't support.
    """
    if lengths is None:
        z = X - Y
        return math.sqrt(float((z * z).mean()))

    s = 0.0
    n = 0
    for x, y, length in zip(X, Y, lengths):
        # Drop the padded frames of each sequence before measuring.
        z = x[:length] - y[:length]
        s += (z * z).sum()
        # Count the elements actually compared. Deriving the count from
        # ``X.shape[-1]`` is wrong for (B x T) inputs, where the last axis is
        # the padded time axis rather than a feature axis.
        n += int(np.prod(z.shape))

    if n == 0:
        # Nothing to average over: return NaN instead of dividing by zero.
        return float("nan")
    return math.sqrt(float(s) / float(n))


def lf0_mean_squared_error(
    src_f0, src_vuv, tgt_f0, tgt_vuv, lengths=None, linear_domain=False
):
    """Root of the mean squared error for log-F0 sequences.

    The error is computed only over frames that are voiced in *both* the
    source and the target. As with :func:`mean_squared_error`, the returned
    value is the square root of the mean squared error.

    Args:
        src_f0 (ndarray): Input log-F0 sequences, shape can be either of
          (``T``,), (``B x T``) or (``B x T x 1``). Both Numpy and torch arrays
          are supported.
        src_vuv (ndarray): Input voiced/unvoiced flag array, shape can be either
          of (``T``, ), (``B x T``) or (``B x T x 1``). Numeric and boolean
          flags are both accepted.
        tgt_f0 (ndarray): Target log-F0 sequences, shape can be either of
          (``T``,), (``B x T``) or (``B x T x 1``). Both Numpy and torch arrays
          are supported.
        tgt_vuv (ndarray): Target voiced/unvoiced flag array, shape can be either
          of (``T``, ), (``B x T``) or (``B x T x 1``). Numeric and boolean
          flags are both accepted.
        lengths (list): Lengths of padded inputs. This should only be specified
          if you give mini-batch inputs.
        linear_domain (bool): Whether to compute the error on the linear
          frequency domain (``exp`` of the inputs) or on the log-frequency
          domain.

    Returns:
        float: Square root of the mean squared error over the commonly voiced
        frames. NaN is returned if ``lengths`` is given and no frame is voiced
        in both sequences.
    """

    if lengths is None:
        # ``* 1`` promotes boolean flags to integers first: adding two boolean
        # arrays is a logical OR, which could never reach 2.
        voiced_indices = (src_vuv * 1 + tgt_vuv * 1) >= 2
        x = src_f0[voiced_indices]
        y = tgt_f0[voiced_indices]
        if linear_domain:
            x, y = _exp(x), _exp(y)
        return mean_squared_error(x, y)

    T = 0
    s = 0.0
    for x, x_vuv, y, y_vuv, length in zip(src_f0, src_vuv, tgt_f0, tgt_vuv, lengths):
        # Drop the padded frames of each utterance before measuring.
        x, x_vuv = x[:length], x_vuv[:length]
        y, y_vuv = y[:length], y_vuv[:length]
        # See above: promote flags to integers before adding them.
        voiced_indices = (x_vuv * 1 + y_vuv * 1) >= 2
        T += int(voiced_indices.sum())
        x, y = x[voiced_indices], y[voiced_indices]
        if linear_domain:
            x, y = _exp(x), _exp(y)
        z = x - y
        s += (z * z).sum()

    if T == 0:
        # No commonly voiced frame: return NaN instead of dividing by zero.
        return float("nan")
    return math.sqrt(float(s) / float(T))


def vuv_error(src_vuv, tgt_vuv, lengths=None):
    """Voiced/unvoiced error rate computation.

    The error rate is the number of positions where the two flag arrays
    differ, divided by the number of positions compared.

    Args:
        src_vuv (ndarray): Input voiced/unvoiced flag array shape can be either
          of (``T``, ), (``B x T``) or (``B x T x 1``).
        tgt_vuv (ndarray): Target voiced/unvoiced flag array shape can be either
          of (``T``, ), (``B x T``) or (``B x T x 1``).
        lengths (list): Lengths of padded inputs. This should only be specified
          if you give mini-batch inputs.

    Returns:
        float: voiced/unvoiced error rate (0 ~ 1).

    Raises:
        ZeroDivisionError: If there is nothing to compare (empty input, or
          ``lengths`` summing to zero).
    """
    if lengths is None:
        # Every element counts as one frame.
        T = np.prod(src_vuv.shape)
        return float((src_vuv != tgt_vuv).sum()) / float(T)

    T = _sum(lengths)
    s = 0.0
    for x, y, length in zip(src_vuv, tgt_vuv, lengths):
        # Drop the padded frames of each utterance before comparing.
        x, y = x[:length], y[:length]
        s += (x != y).sum()
    return float(s) / float(T)