# Source code for ttslearn.tacotron.tts

import json
from pathlib import Path

import numpy as np
import pyopenjtalk
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf
from tqdm import tqdm
from ttslearn.dsp import inv_mulaw_quantize, logmelspectrogram_to_audio
from ttslearn.pretrained import retrieve_pretrained_model
from ttslearn.tacotron.frontend.openjtalk import pp_symbols, text_to_sequence
from ttslearn.util import StandardScaler


class Tacotron2TTS:
    """Tacotron 2 based text-to-speech engine.

    Loads a Tacotron 2 acoustic model and a WaveNet vocoder from a model
    directory and synthesizes a waveform from text via :meth:`tts`.

    Args:
        model_dir (str or os.PathLike, optional): model directory. A pre-trained
            model (ID: ``tacotron2``) is used if None.
        device (str): cpu or cuda.

    Examples:

        >>> from ttslearn.tacotron import Tacotron2TTS
        >>> engine = Tacotron2TTS()
        >>> wav, sr = engine.tts("一貫学習にチャレンジしましょう！")
    """

    def __init__(self, model_dir=None, device="cpu"):
        self.device = device

        if model_dir is None:
            model_dir = retrieve_pretrained_model("tacotron2")
        # Path() accepts str as well as any os.PathLike, and leaves an
        # existing Path unchanged, so no isinstance() check is needed.
        model_dir = Path(model_dir)

        # Optional config.yaml overrides the default sampling rate and mu.
        if (model_dir / "config.yaml").exists():
            config = OmegaConf.load(model_dir / "config.yaml")
            self.sample_rate = config.sample_rate
            self.mu = config.mu
        else:
            self.sample_rate = 16000
            self.mu = 255

        # Acoustic model
        # NOTE(review): torch.load unpickles the checkpoint; only point
        # model_dir at directories whose contents are trusted.
        self.acoustic_config = OmegaConf.load(model_dir / "acoustic_model.yaml")
        self.acoustic_model = instantiate(self.acoustic_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "acoustic_model.pth",
            map_location=device,
        )
        self.acoustic_model.load_state_dict(checkpoint["state_dict"])
        # Scaler used to undo the normalization of the acoustic model output
        # (needed on the Griffin-Lim path of tts()).
        self.acoustic_out_scaler = StandardScaler(
            np.load(model_dir / "out_tacotron_scaler_mean.npy"),
            np.load(model_dir / "out_tacotron_scaler_var.npy"),
            np.load(model_dir / "out_tacotron_scaler_scale.npy"),
        )
        self.acoustic_model.eval()

        # WaveNet vocoder
        self.wavenet_config = OmegaConf.load(model_dir / "wavenet_model.yaml")
        self.wavenet_model = instantiate(self.wavenet_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "wavenet_model.pth",
            map_location=device,
        )
        self.wavenet_model.load_state_dict(checkpoint["state_dict"])
        self.wavenet_model.eval()
        self.wavenet_model.remove_weight_norm_()

    def __repr__(self):
        """Return the sampling rate and both ``netG`` configs as indented JSON."""
        acoustic_str = json.dumps(
            OmegaConf.to_container(self.acoustic_config["netG"]),
            sort_keys=False,
            indent=4,
        )
        wavenet_str = json.dumps(
            OmegaConf.to_container(self.wavenet_config["netG"]),
            sort_keys=False,
            indent=4,
        )

        return f"""Tacotron2 TTS (sampling rate: {self.sample_rate})

Acoustic model: {acoustic_str}
Vocoder model: {wavenet_str}
"""

    def set_device(self, device):
        """Set device for the TTS models

        Args:
            device (str): cpu or cuda.
        """
        self.device = device
        self.acoustic_model.to(device)
        self.wavenet_model.to(device)

    @torch.no_grad()
    def tts(self, text, griffin_lim=False, tqdm=tqdm):
        """Run TTS

        Args:
            text (str): Input text
            griffin_lim (bool, optional): Use Griffin-Lim algorithm or not.
                Defaults to False, in which case the WaveNet vocoder is used.
            tqdm (object, optional): progress-bar object forwarded to the
                WaveNet vocoder's ``inference``. Defaults to ``tqdm.tqdm``.

        Returns:
            tuple: audio array (np.int16) and sampling rate (int)
        """
        # Extract linguistic features (full-context labels) with OpenJTalk
        contexts = pyopenjtalk.extract_fullcontext(text)
        # Convert to a phoneme sequence with prosody symbols, then to IDs
        in_feats = text_to_sequence(pp_symbols(contexts))
        in_feats = torch.tensor(in_feats, dtype=torch.long).to(self.device)

        # (T, C); only the second of the four returned values is used
        _, out_feats, _, _ = self.acoustic_model.inference(in_feats)

        if griffin_lim:
            # Waveform synthesis based on the Griffin-Lim algorithm
            out_feats = out_feats.detach().cpu().numpy()
            # Undo the normalization
            logmel = self.acoustic_out_scaler.inverse_transform(out_feats)
            gen_wav = logmelspectrogram_to_audio(logmel, self.sample_rate)
        else:
            # (B, T, C) -> (B, C, T)
            c = out_feats.view(1, -1, out_feats.size(-1)).transpose(1, 2)

            # Compute the length of the waveform to generate
            upsample_scale = np.prod(self.wavenet_model.upsample_scales)
            T = (
                c.shape[-1] - self.wavenet_model.aux_context_window * 2
            ) * upsample_scale

            # Generate the waveform with the WaveNet vocoder
            # NOTE: this is slow, so a tqdm progress bar is used
            gen_wav = self.wavenet_model.inference(c, T, tqdm)

            # Take the index of the maximum along dim 1 and flatten to 1-D
            gen_wav = gen_wav.max(1)[1].float().cpu().numpy().reshape(-1)

            # Inverse mu-law quantization
            # NOTE: assumes mu equals the number of output channels - 1
            # NOTE(review): self.mu (read from config.yaml) is not used here;
            # confirm whether it should replace out_channels - 1.
            gen_wav = inv_mulaw_quantize(gen_wav, self.wavenet_model.out_channels - 1)

        return self.post_process(gen_wav), self.sample_rate

    def post_process(self, wav):
        """Convert a waveform to 16-bit integer samples.

        Args:
            wav (ndarray): waveform; values are clipped to [-1.0, 1.0].

        Returns:
            ndarray: clipped waveform scaled by 32767 and cast to np.int16.
        """
        wav = np.clip(wav, -1.0, 1.0)
        wav = (wav * 32767.0).astype(np.int16)
        return wav


def randomize_tts_engine_(engine: Tacotron2TTS) -> Tacotron2TTS:
    """Break a trained engine in place by randomizing part of its attention.

    The weight of ``acoustic_model.decoder.attention.mlp_dec`` is overwritten
    with values drawn by ``torch.nn.init.normal_`` (default arguments).

    Args:
        engine (Tacotron2TTS): engine whose acoustic model is modified in place.

    Returns:
        Tacotron2TTS: the same ``engine`` object that was passed in.
    """
    attention = engine.acoustic_model.decoder.attention
    # Forcibly re-initialize part of the attention parameters with random
    # numbers, which destroys the trained model.
    mlp_dec_weight = attention.mlp_dec.weight.data
    torch.nn.init.normal_(mlp_dec_weight)
    return engine