# Source code for ttslearn.tacotron.gen

import numpy as np
import torch
from tqdm import tqdm
from ttslearn.dsp import inv_mulaw_quantize, logmelspectrogram_to_audio
from ttslearn.tacotron.frontend.openjtalk import pp_symbols, text_to_sequence

@torch.no_grad()
def synthesis_griffin_lim(device, sample_rate, labels, acoustic_model, scaler):
    """Generate a waveform from labels using the acoustic model and Griffin-Lim.

    The label contexts are converted to a symbol-ID sequence, passed through
    the acoustic model, de-normalized with ``scaler`` and finally turned into
    audio by ``logmelspectrogram_to_audio``. Gradient tracking is disabled
    for the whole call.

    Args:
        device (torch.device): device the input sequence is moved to
            (CPU or GPU).
        sample_rate (int): sample rate forwarded to
            ``logmelspectrogram_to_audio``.
        labels: label object exposing a ``contexts`` attribute (the
            full-context labels to synthesize from).
        acoustic_model (ttslearn.tacotron.models.Tacotron): acoustic model;
            its ``inference`` method must return four values, of which only
            the second one is used here.
        scaler (sklearn.preprocessing.StandardScaler): scaler whose
            ``inverse_transform`` undoes the feature normalization.

    Returns:
        The waveform returned by ``logmelspectrogram_to_audio``.
        NOTE(review): presumably a ``numpy.ndarray`` rather than a
        ``torch.Tensor``, since the features are converted to NumPy before
        vocoding -- confirm against ``ttslearn.dsp``.
    """
    # Labels -> pronunciation/prosody symbols -> integer IDs
    symbols = pp_symbols(labels.contexts)
    symbol_ids = torch.tensor(text_to_sequence(symbols), dtype=torch.long).to(device)

    # Predicted features; (T, C) according to the original author's comment
    _, normalized_logmel, _, _ = acoustic_model.inference(symbol_ids)

    # Denormalization
    logmel = scaler.inverse_transform(normalized_logmel.cpu().data.numpy())

    return logmelspectrogram_to_audio(logmel, sample_rate)
@torch.no_grad()
def synthesis(device, sample_rate, labels, acoustic_model, wavenet_model, _tqdm=tqdm):
    """Synthesize a waveform with the acoustic model and a WaveNet vocoder.

    The label contexts are converted to a symbol-ID sequence, passed through
    the acoustic model, and the predicted features are used as conditioning
    for WaveNet. The WaveNet output is converted to class indices and then
    mapped back to a signal by inverse mu-law quantization. Gradient tracking
    is disabled for the whole call.

    Args:
        device (torch.device): device the input sequence is moved to
            (CPU or GPU).
        sample_rate (int): sample rate of the output waveform. Not used by
            this function; kept so the signature parallels
            ``synthesis_griffin_lim``.
        labels: label object exposing a ``contexts`` attribute (the
            full-context labels to synthesize from).
        acoustic_model (ttslearn.tacotron.models.Tacotron): acoustic model;
            its ``inference`` method must return four values, of which only
            the second one is used here.
        wavenet_model (ttslearn.wavenet.WaveNet): WaveNet vocoder. Its
            ``upsample_scales``, ``aux_context_window`` and ``out_channels``
            attributes are read.
        _tqdm (optional): tqdm-compatible progress bar passed to
            ``wavenet_model.inference``.

    Returns:
        The waveform returned by ``inv_mulaw_quantize``.
        NOTE(review): presumably a ``numpy.ndarray`` rather than a
        ``torch.Tensor``, since the input to ``inv_mulaw_quantize`` is a
        NumPy array -- confirm against ``ttslearn.dsp``.
    """
    in_feats = text_to_sequence(pp_symbols(labels.contexts))
    in_feats = torch.tensor(in_feats, dtype=torch.long).to(device)

    # (T, C)
    _, out_feats, _, _ = acoustic_model.inference(in_feats)

    # (B, T, C) -> (B, C, T)
    c = out_feats.view(1, -1, out_feats.size(-1)).transpose(1, 2)

    # Compute the length of the speech waveform: each conditioning frame
    # (excluding the auxiliary context on both sides) expands to
    # ``upsample_scale`` samples.
    # NOTE(review): the right-hand side of this assignment was missing in the
    # source; restored as the product of the vocoder's per-layer upsampling
    # factors -- confirm the attribute name against the WaveNet class.
    upsample_scale = int(np.prod(wavenet_model.upsample_scales))
    time_steps = (c.shape[-1] - wavenet_model.aux_context_window * 2) * upsample_scale

    # Generate the speech waveform with WaveNet
    # NOTE: generation is slow, so a tqdm progress bar is accepted
    gen_wav = wavenet_model.inference(c, time_steps, _tqdm)

    # Convert from one-hot vectors to a one-dimensional signal
    gen_wav = gen_wav.max(1)[1].float().cpu().numpy().reshape(-1)

    # Inverse of mu-law quantization
    # NOTE: mu is assumed to be (number of output channels - 1)
    gen_wav = inv_mulaw_quantize(gen_wav, wavenet_model.out_channels - 1)

    return gen_wav