import json
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pyopenjtalk
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf
from parallel_wavegan.utils import load_model
from ttslearn.pretrained import retrieve_pretrained_model
from ttslearn.tacotron.frontend.openjtalk import pp_symbols, text_to_sequence
class Tacotron2PWGTTS(object):
"""Fast Tacotron 2 based text-to-speech with Parallel WaveGAN
The WaveNet vocoder in Tacotron 2 is replaced with Parallel WaveGAN for
fast real-time inference.
Both single-speaker and multi-speaker Tacotron are supported.
Args:
model_dir (str): model directory. A pre-trained model (ID: ``tacotron2_pwg_jsut24k``)
is used if None.
device (str): cpu or cuda.
Examples:
Singler-speaker TTS
>>> from ttslearn.contrib import Tacotron2PWGTTS
>>> engine = Tacotron2PWGTTS()
>>> wav, sr = engine.tts("発展的な音声合成です!")
Multi-speaker TTS
>>> from ttslearn.contrib import Tacotron2PWGTTS
>>> from ttslearn.pretrained import retrieve_pretrained_model
>>> model_dir = retrieve_pretrained_model("multspk_tacotron2_pwg_jvs24k")
>>> engine = Tacotron2PWGTTS(model_dir)
>>> wav, sr = engine.tts("じぇーぶいえすコーパス10番目の話者です。", spk_id=10)
.. note::
This class supports not only `Parallel WaveGAN`_ but also any models supported in
`kan-bayashi/ParallelWaveGAN`_.
For example, HifiGAN or MelGAN can also be used without any change.
.. _Parallel WaveGAN: https://arxiv.org/abs/1910.11480
.. _kan-bayashi/ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN
"""
def __init__(self, model_dir=None, device="cpu"):
self.device = device
if model_dir is None:
model_dir = retrieve_pretrained_model("tacotron2_pwg_jsut16k")
if isinstance(model_dir, str):
model_dir = Path(model_dir)
        # Load config.yaml if present; otherwise fall back to the default sampling rate
if (model_dir / "config.yaml").exists():
config = OmegaConf.load(model_dir / "config.yaml")
self.sample_rate = config.sample_rate
else:
self.sample_rate = 16000
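        # (config.yaml is assumed to contain at least a `sample_rate` field,
        # e.g. "sample_rate: 16000".)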
        # Speaker information (for multi-speaker models)
if (model_dir / "spks").exists():
            # Load the list of speaker names
with open(model_dir / "spks", "r") as f:
self.spks: Optional[List[str]] = [line.strip() for line in f]
            # Build a dictionary mapping speaker names to (integer) speaker IDs
with open(model_dir / "spk2id") as f:
self.spk2id: Optional[Dict[str, int]] = {}
for line in f:
k, v = line.strip().split(":")
self.spk2id[k] = int(v)
else:
self.spks, self.spk2id = None, None
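        # File formats inferred from the parsing above:
        #   spks:   one speaker name per line
        #   spk2id: one colon-separated "name:id" pair per line (e.g. "spk1:0")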
        # Acoustic model
self.acoustic_config = OmegaConf.load(model_dir / "acoustic_model.yaml")
self.acoustic_model = instantiate(self.acoustic_config.netG).to(device)
checkpoint = torch.load(
model_dir / "acoustic_model.pth",
map_location=device,
)
self.acoustic_model.load_state_dict(checkpoint["state_dict"])
self.acoustic_model.eval()
        # Vocoder (parallel_wavegan)
self.vocoder_config = OmegaConf.load(model_dir / "vocoder_model.yaml")
self.vocoder = load_model(
model_dir / "vocoder_model.pth", config=self.vocoder_config
).to(device)
self.vocoder.eval()
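        # Weight normalization is only needed during training; removing it
        # gives slightly faster inference.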
self.vocoder.remove_weight_norm()
def __repr__(self):
acoustic_str = json.dumps(
OmegaConf.to_container(self.acoustic_config.netG),
sort_keys=False,
indent=4,
)
vocoder_params = {
"generator_type": self.vocoder_config.get(
"generator_type", "ParallelWaveGANGenerator" # type: ignore
),
"generator_params": OmegaConf.to_container(
self.vocoder_config.generator_params
),
}
vocoder_str = json.dumps(
vocoder_params,
sort_keys=False,
indent=4,
)
return f"""Tacotron2 TTS (sampling rate: {self.sample_rate})
Acoustic model: {acoustic_str}
Vocoder model: {vocoder_str}
"""
    def set_device(self, device):
"""Set device for the TTS models
Args:
device (str): cpu or cuda.
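
        Examples:
            >>> engine = Tacotron2PWGTTS()
            >>> engine.set_device("cuda")  # assumes a CUDA device is available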
"""
self.device = device
self.acoustic_model.to(device)
self.vocoder.to(device)
    @torch.no_grad()
def tts(self, text, tqdm=None, spk_id=None):
"""Run TTS
Args:
text (str): Input text
tqdm (obj): tqdm progress bar
spk_id (int): speaker id. This should be only specified for multi-speaker models.
Returns:
tuple: audio array (np.int16) and sampling rate (int)
"""
        # Extract full-context labels (linguistic features) with Open JTalk
labels = pyopenjtalk.extract_fullcontext(text)
        # Convert to a sequence of phonemes and prosody symbols
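        # pp_symbols extracts phonemes plus prosody symbols (e.g. "^" for BOS,
        # "$" for EOS, "_" for a pause, "#" for an accent phrase boundary,
        # "[" / "]" for pitch rise / fall) from the full-context labels.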
in_feats = text_to_sequence(pp_symbols(labels))
in_feats = torch.tensor(in_feats, dtype=torch.long).to(self.device)
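        # The acoustic model's inference() returns (decoder outputs, post-net
        # outputs, stop-token logits, attention weights); the post-net refined
        # mel-spectrogram is what the vocoder consumes.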
if hasattr(self.acoustic_model, "spk_embed"):
assert spk_id is not None
spk_id = torch.tensor([spk_id], dtype=torch.long).to(self.device)
_, out_feats, _, _ = self.acoustic_model.inference(in_feats, spk_id)
else:
# (T, C)
_, out_feats, _, _ = self.acoustic_model.inference(in_feats)
        # Generate the waveform with the parallel_wavegan vocoder
gen_wav = self.vocoder.inference(out_feats).view(-1).to("cpu").numpy()
return self.post_process(gen_wav), self.sample_rate
    def post_process(self, wav):
        # Clip to the valid range and convert to 16-bit PCM
        wav = np.clip(wav, -1.0, 1.0)
        wav = (wav * 32767.0).astype(np.int16)
        return wav
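
# A minimal smoke test (a sketch, not part of the library API): it assumes
# the default pre-trained model can be downloaded and that scipy is installed.
if __name__ == "__main__":
    from scipy.io import wavfile

    engine = Tacotron2PWGTTS()
    print(engine)
    # Synthesize a short utterance and write it as a 16-bit PCM WAV file.
    wav, sr = engine.tts("音声合成のテストです。")
    wavfile.write("output.wav", sr, wav)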