# Source code for nnmnkwii.preprocessing.modspec

import numpy as np

# TODO: this may be removed in future.


def modspec(x, n=4096, norm=None, return_phase=False):
    """Modulation spectrum (MS) computation

    Given a parameter trajectory, it computes modulation spectrum. In the
    library, we define modulation spectrum as power of discrete Fourier transform
    of parameter trajectory across time-axis. See [1]_ for example application.

    .. [1] Takamichi, Shinnosuke, et al. "A postfilter to modify the modulation
      spectrum in HMM-based speech synthesis." Acoustics, Speech and Signal
      Processing (ICASSP), 2014 IEEE International Conference on. IEEE, 2014.

    .. warning::

        This may move in different module in future.

    Args:
        x (numpy.ndarray): Parameter trajectory, shape (``T x D``).
        n (int): DFT length. The trajectory is zero-padded (or cropped) to
          this length along the time axis by :func:`numpy.fft.rfft`.
        norm (str): Normalization mode. See :func:`numpy.fft.fft`.
        return_phase (bool): If True, also return the phase of the DFT.

    Returns:
        tuple or numpy.ndarray: Modulation spectrum (``n//2 + 1 x D``). If
        ``return_phase`` is True, a tuple ``(ms, phase)`` is returned instead,
        where ``phase`` is a complex array of the same shape holding the
        unit-magnitude phasor ``exp(1j * angle)`` of each DFT bin (not the
        angle itself), so that ``sqrt(ms) * phase`` gives back the DFT.

    See also:
        :func:`nnmnkwii.preprocessing.inv_modspec`,
        :func:`nnmnkwii.autograd.modspec`

    Examples:
        >>> import numpy as np
        >>> from nnmnkwii import preprocessing as P
        >>> generated = np.random.rand(10, 2)
        >>> ms = P.modspec(generated, n=16)
        >>> ms.shape
        (9, 2)
    """
    # One-sided DFT along the time axis (axis 0).
    s_complex = np.fft.rfft(x, n=n, axis=0, norm=norm)
    # Sanity check on the one-sided spectrum length.
    assert s_complex.shape[0] == n // 2 + 1

    # Power |X(w)|^2, computed from real/imaginary parts to avoid a sqrt.
    R, im = s_complex.real, s_complex.imag
    ms = R * R + im * im

    if return_phase:
        # Unit-magnitude phasor carrying only the phase of each bin.
        return ms, np.exp(1.0j * np.angle(s_complex))
    return ms


# For compat
def modphase(x, n=4096, norm=None):
    """Phase of the modulation spectrum (kept for backward compatibility).

    Thin wrapper around :func:`modspec` that requests the phase and discards
    the power spectrum.

    Args:
        x (numpy.ndarray): Parameter trajectory, forwarded to :func:`modspec`.
        n (int): DFT length, forwarded to :func:`modspec`.
        norm (str): Normalization mode, forwarded to :func:`modspec`.

    Returns:
        The second element of ``modspec(..., return_phase=True)``.
    """
    _, phase = modspec(x, n=n, norm=norm, return_phase=True)
    return phase


def inv_modspec(ms, phase, norm=None):
    """Inverse transform of modulation spectrum computation

    Given a modulation spectrum and its phase, it recovers the original
    parameter trajectory.

    .. note::
        Returned parameter trajectory has shape (``n x D``), where ``n`` is DFT
        length used in modulation spectrum computation. You will have to
        trim it yourself to the actual time length if needed.

    .. note::
        The DFT length is inferred as ``(ms.shape[0] - 1) * 2``, i.e. it is
        assumed to be even. A spectrum computed with an odd DFT length is
        inverted with length ``n - 1``.

    .. warning::

        This may move in different module in future.

    Args:
        ms (numpy.ndarray): Modulation spectrum (``n//2 + 1 x D``).
        phase (numpy.ndarray): Phase of modulation spectrum
          (``n//2 + 1 x D``). It is multiplied with the amplitude, so it is
          expected to be the complex unit phasor returned by
          :func:`nnmnkwii.preprocessing.modspec` with ``return_phase=True``.
        norm (str): Normalization mode. See :func:`numpy.fft.fft`.

    Returns:
        numpy.ndarray: Recovered parameter trajectory, shape (``n x D``).

    Examples:
        >>> import numpy as np
        >>> from nnmnkwii import preprocessing as P
        >>> generated = np.random.rand(10, 2)
        >>> ms, phase = P.modspec(generated, n=16, return_phase=True)
        >>> generated_hat = P.inv_modspec(ms, phase)[:len(generated)]
        >>> assert np.allclose(generated, generated_hat)

    See also:
        :func:`nnmnkwii.preprocessing.modspec`.
    """
    # Full DFT length implied by the one-sided spectrum (even length assumed).
    n = (ms.shape[0] - 1) * 2

    # |X(w)|^2 -> |X(w)|
    amp = np.sqrt(ms)

    # X(w): re-attach the phase to the amplitude.
    complex_ms = amp * phase

    # x: inverse one-sided DFT along the time axis.
    x = np.fft.irfft(complex_ms, n=n, norm=norm, axis=0)
    return x


def modspec_smoothing(x, modfs, n=4096, norm=None, cutoff=50, log_domain=True):
    """Parameter trajectory smoothing by removing high frequency bands of MS.

    Given a parameter trajectory, it removes high frequency bands of its
    modulation spectrum (MS).

    It's known that the effect of the MS components in high MS frequency bands
    on quality of analysis-synthesized speech is negligible in HMM-based speech
    synthesis. See [1]_ for details.

    .. [1] Takamichi, Shinnosuke, et al. "The NAIST text-to-speech system for
      the Blizzard Challenge 2015." Proc. Blizzard Challenge workshop. 2015.

    .. note::
        With ``log_domain=True`` the bins above the cut-off are set to zero
        in the *log* MS domain, so after exponentiation their MS value is 1
        rather than 0.

    .. note::
        The inverse transform assumes an even DFT length; prefer an even
        ``n``.

    Args:
        x (numpy.ndarray): Parameter trajectory, shape (``T x D``).
        modfs (int): Sampling frequency in modulation spectrum domain. In
          frame-based processing, this will be ``fs / hop_length``.
        n (int): DFT length. Must be at least ``T``.
        norm (str): Normalization mode. See :func:`numpy.fft.fft`.
        cutoff (float): Cut-off frequency in Hz, between 0 and the Nyquist
          frequency ``modfs / 2`` inclusive. If None, no band is removed.
        log_domain (bool): Whether it performs high frequency band removal on
          log modulation spectrum domain or not.

    Returns:
        numpy.ndarray: Smoothed parameter trajectory, shape (``T x D``).

    Raises:
        ValueError: If ``cutoff`` is negative or above the Nyquist frequency.
        RuntimeError: If ``n`` is smaller than the time length ``T``.

    Examples:
        >>> import numpy as np
        >>> from nnmnkwii import preprocessing as P
        >>> generated = np.random.rand(10, 2)
        >>> smoothed = P.modspec_smoothing(generated, modfs=200, n=16, cutoff=50)
        >>> smoothed.shape
        (10, 2)
    """
    # Unpacking also enforces that the input is 2-D (time x dimension).
    T, _ = x.shape

    # Validate the cut-off only when one is given; ``None`` disables removal.
    if cutoff is not None:
        # True division: floor division would under-estimate the Nyquist
        # frequency for odd ``modfs`` and reject valid cut-offs.
        nyquist = modfs / 2
        if cutoff < 0:
            raise ValueError(
                "Cutoff frequency {} Hz must be non-negative".format(cutoff)
            )
        if cutoff > nyquist:
            raise ValueError(
                "Cutoff frequency {} Hz must not exceed Nyquist frequency {} Hz".format(
                    cutoff, nyquist
                )
            )
    if n < T:
        raise RuntimeError(
            "DFT length {} must be greater than or equal to time length {}".format(
                n, T
            )
        )

    ms, phase = modspec(x, n=n, norm=norm, return_phase=True)
    if log_domain:
        ms = np.log(ms)

    if cutoff is not None:
        # First bin strictly above the cut-off frequency.
        limit_bin = int(n * cutoff / modfs) + 1
        if limit_bin < len(ms):
            ms[limit_bin:] = 0

    if log_domain:
        ms = np.exp(ms)

    x_hat = inv_modspec(ms, phase, norm=norm)
    # Drop the zero-padded tail so the output matches the input length.
    return np.ascontiguousarray(x_hat[:T])