Source code for nnmnkwii.baseline.gmm

import numpy as np
from nnmnkwii.paramgen import mlpg
from scipy import linalg
from sklearn.mixture import GaussianMixture


# ref: https://github.com/scikit-learn/scikit-learn/blob/0.24.1/sklearn/mixture/
def _compute_precision_cholesky(covariances, covariance_type):
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    if covariance_type == "full":
        n_components, n_features, _ = covariances.shape
        precisions_chol = np.empty((n_components, n_features, n_features))
        for k, covariance in enumerate(covariances):
            try:
                cov_chol = linalg.cholesky(covariance, lower=True)
            except linalg.LinAlgError:
                raise ValueError(estimate_precision_error_message)
            precisions_chol[k] = linalg.solve_triangular(
                cov_chol, np.eye(n_features), lower=True
            ).T
    elif covariance_type == "tied":
        _, n_features = covariances.shape
        try:
            cov_chol = linalg.cholesky(covariances, lower=True)
        except linalg.LinAlgError:
            raise ValueError(estimate_precision_error_message)
        precisions_chol = linalg.solve_triangular(
            cov_chol, np.eye(n_features), lower=True
        ).T
    else:
        if np.any(np.less_equal(covariances, 0.0)):
            raise ValueError(estimate_precision_error_message)
        precisions_chol = 1.0 / np.sqrt(covariances)
    return precisions_chol
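
# A minimal sketch of the contract above (illustrative, with a made-up
# covariance): for a "full" covariance C = L @ L.T with lower-triangular L,
# the returned factor P = (L^{-1}).T satisfies P @ P.T == inv(C).
#
# >>> C = np.array([[[2.0, 0.5], [0.5, 1.0]]])  # one component, two features
# >>> P = _compute_precision_cholesky(C, "full")
# >>> np.allclose(P[0] @ P[0].T, np.linalg.inv(C[0]))
# True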


# TODO: this can be refactored to be more flexible
# e.g. take `swap` and `diff` out of the class


class MLPGBase(object):
    def __init__(self, gmm, swap=False, diff=False):
        assert gmm.covariance_type == "full"
        # D: static + delta dim (half of the joint feature dimension)
        D = gmm.means_.shape[1] // 2
        self.num_mixtures = gmm.means_.shape[0]
        self.weights = gmm.weights_

        # Split source and target parameters from joint GMM
        self.src_means = gmm.means_[:, :D]
        self.tgt_means = gmm.means_[:, D:]
        self.covarXX = gmm.covariances_[:, :D, :D]
        self.covarXY = gmm.covariances_[:, :D, D:]
        self.covarYX = gmm.covariances_[:, D:, :D]
        self.covarYY = gmm.covariances_[:, D:, D:]

        if diff:
            self.tgt_means = self.tgt_means - self.src_means
            self.covarYY = self.covarXX + self.covarYY - self.covarXY - self.covarYX
            self.covarXY = self.covarXY - self.covarXX
            self.covarYX = self.covarXY.transpose(0, 2, 1)

        # swap src and target parameters
        if swap:
            self.tgt_means, self.src_means = self.src_means, self.tgt_means
            self.covarYY, self.covarXX = self.covarXX, self.covarYY
            self.covarYX, self.covarXY = self.covarXY, self.covarYX

        # p(x), which is used to compute the posterior probability for a
        # given source spectral feature in the mapping stage.
        self.px = GaussianMixture(
            n_components=self.num_mixtures, covariance_type="full"
        )
        self.px.means_ = self.src_means
        self.px.covariances_ = self.covarXX
        self.px.weights_ = self.weights
        self.px.precisions_cholesky_ = _compute_precision_cholesky(
            self.px.covariances_, "full"
        )
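
    # Sanity-check sketch for the block split above (hypothetical setup:
    # a 3-component joint GMM over 2 source + 2 target dimensions, fitted
    # on random data):
    #
    # >>> gmm = GaussianMixture(n_components=3, covariance_type="full")
    # >>> _ = gmm.fit(np.random.rand(100, 4))
    # >>> base = MLPGBase(gmm)
    # >>> base.src_means.shape, base.covarXY.shape
    # ((3, 2), (3, 2, 2))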

    def transform(self, src):
        """Convert a single source frame (1D input) or a sequence of
        source frames (2D input) frame by frame."""
        if src.ndim == 2:
            tgt = np.zeros_like(src)
            for idx, x in enumerate(src):
                y = self._transform_frame(x)
                tgt[idx][: len(y)] = y
            return tgt
        else:
            return self._transform_frame(src)

    def _transform_frame(self, src):
        """Map a source spectral feature x to a target spectral feature y
        so as to minimize the mean squared error.
        More specifically, it returns the conditional mean E[y|x].

        Args:
            src (array): shape (`order of spectral feature`) source speaker's
                spectral feature to be transformed

        Returns:
            array: converted spectral feature
        """
        D = len(src)

        # Eq.(11)
        E = np.zeros((self.num_mixtures, D))
        for m in range(self.num_mixtures):
            xx = np.linalg.solve(self.covarXX[m], src - self.src_means[m])
            E[m] = self.tgt_means[m] + self.covarYX[m].dot(xx)

        # Eq.(9) p(m|x)
        posterior = self.px.predict_proba(np.atleast_2d(src))

        # Eq.(13) conditional mean E[y|x]
        return posterior.dot(E).flatten()
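
# The per-mixture mean in Eq. (11) is the standard Gaussian conditioning
# formula:
#
#     E[y | x, m] = tgt_means[m] + covarYX[m] @ inv(covarXX[m]) @ (x - src_means[m])
#
# With a single mixture the posterior in Eq. (9) is 1, so transform()
# reduces to exactly that expression. Illustrative check (random data):
#
# >>> gmm = GaussianMixture(n_components=1, covariance_type="full")
# >>> _ = gmm.fit(np.random.rand(50, 4))
# >>> base = MLPGBase(gmm)
# >>> x = np.random.rand(2)
# >>> cond_mean = base.tgt_means[0] + base.covarYX[0].dot(
# ...     np.linalg.solve(base.covarXX[0], x - base.src_means[0]))
# >>> np.allclose(base.transform(x), cond_mean)
# True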


class MLPG(MLPGBase):
    """Maximum likelihood Parameter Generation (MLPG) for GMM-based
    voice conversion [1]_.

    Notes:
        - Source speaker's feature: ``X = {x_t}, 0 <= t < T``
        - Target speaker's feature: ``Y = {y_t}, 0 <= t < T``

        where T is the number of time frames.

    See paper [1]_ for details.

    The code was adapted from https://gist.github.com/r9y9/88bda659c97f46f42525.

    Args:
        gmm (sklearn.mixture.GaussianMixture): Gaussian Mixture Models of
            source and target joint features.
        windows (list): List of windows. See :func:`nnmnkwii.paramgen.mlpg`
            for details.
        swap (bool): If True, source -> target, otherwise target -> source.
        diff (bool): Convert GMM -> DIFFGMM if True.

    Attributes:
        num_mixtures (int): The number of Gaussian mixtures
        weights (array): shape (`num_mixtures`), weights for each Gaussian
        src_means (array): shape (`num_mixtures`, `order of spectral feature`)
            means of GMM for a source speaker
        tgt_means (array): shape (`num_mixtures`, `order of spectral feature`)
            means of GMM for a target speaker
        covarXX (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) variance matrix of source speaker's
            spectral feature
        covarXY (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) covariance matrix of source and
            target speaker's spectral feature
        covarYX (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) covariance matrix of target and
            source speaker's spectral feature
        covarYY (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) variance matrix of target speaker's
            spectral feature
        D (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) covariance matrices of target static
            spectral features
        px (sklearn.mixture.GaussianMixture): Gaussian Mixture Models of
            source speaker's features

    Examples:
        >>> from sklearn.mixture import GaussianMixture
        >>> from nnmnkwii.baseline.gmm import MLPG
        >>> import numpy as np
        >>> static_dim, T = 24, 10
        >>> windows = [
        ...     (0, 0, np.array([1.0])),
        ...     (1, 1, np.array([-0.5, 0.0, 0.5])),
        ...     (1, 1, np.array([1.0, -2.0, 1.0])),
        ... ]
        >>> src = np.random.rand(T, static_dim * len(windows))
        >>> tgt = np.random.rand(T, static_dim * len(windows))
        >>> XY = np.concatenate((src, tgt), axis=-1)  # pseudo parallel data
        >>> gmm = GaussianMixture(n_components=4)
        >>> _ = gmm.fit(XY)
        >>> paramgen = MLPG(gmm, windows=windows)
        >>> generated = paramgen.transform(src)
        >>> assert generated.shape == (T, static_dim)

    See also:
        :class:`nnmnkwii.preprocessing.alignment.IterativeDTWAligner`.

    .. [1] [Toda 2007] Voice Conversion Based on Maximum Likelihood
       Estimation of Spectral Parameter Trajectory.
    """

    def __init__(self, gmm, windows=None, swap=False, diff=False):
        super(MLPG, self).__init__(gmm, swap, diff)
        if windows is None:
            windows = [
                (0, 0, np.array([1.0])),
                (1, 1, np.array([-0.5, 0.0, 0.5])),
            ]
        self.windows = windows
        self.static_dim = gmm.means_.shape[-1] // 2 // len(windows)
    def transform(self, src):
        """Mapping source feature x to target feature y so as to maximize
        the likelihood of y given x.

        Args:
            src (array): shape (`the number of frames`, `the order of
                spectral feature`) a sequence of source speaker's spectral
                features to be transformed.

        Returns:
            array: a sequence of transformed features
        """
        T, feature_dim = src.shape[0], src.shape[1]

        if feature_dim == self.static_dim:
            return super(MLPG, self).transform(src)

        # A suboptimum mixture sequence (eq.37)
        optimum_mix = self.px.predict(src)

        # Compute E eq.(40)
        E = np.empty((T, feature_dim))
        for t in range(T):
            m = optimum_mix[t]  # estimated mixture index at time t
            xx = np.linalg.solve(self.covarXX[m], src[t] - self.src_means[m])
            # Eq. (22)
            E[t] = self.tgt_means[m] + np.dot(self.covarYX[m], xx)

        # Compute D eq.(23)
        # Approximate covariances with their diagonals so that MLPG can be
        # run efficiently in a dimension-wise manner
        D = np.empty((T, feature_dim))
        for t in range(T):
            m = optimum_mix[t]
            # Eq. (23), approximating covariances as diagonal
            D[t] = np.diag(self.covarYY[m]) - np.diag(self.covarYX[m]) / np.diag(
                self.covarXX[m]
            ) * np.diag(self.covarXY[m])

        # Once we have the mean and variance over frames, we can run MLPG
        return mlpg(E, D, self.windows)
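
# Usage sketch for the differential (``diff=True``) variant: the model then
# predicts the difference y - x rather than y itself, and the caller applies
# the predicted differential back to the source features (for example, by
# adding it to the source static features). ``gmm``, ``windows``, ``src``,
# ``T`` and ``static_dim`` are assumed to be set up as in the class
# docstring above.
#
# >>> paramgen = MLPG(gmm, windows=windows, diff=True)
# >>> predicted_diff = paramgen.transform(src)
# >>> predicted_diff.shape == (T, static_dim)
# True
# >>> converted = src[:, :static_dim] + predicted_diff  # one simple way to apply it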