Source code for ttslearn.wavenet.wavenet

import torch
from torch import nn
from torch.nn import functional as F
from ttslearn.dsp import mulaw_quantize
from ttslearn.wavenet.modules import Conv1d1x1, ResSkipBlock
from ttslearn.wavenet.upsample import ConvInUpsampleNetwork


[docs]class WaveNet(nn.Module):
    """WaveNet

    Args:
        out_channels (int): the number of output channels
        layers (int): the number of layers
        stacks (int): the number of residual stacks
        residual_channels (int): the number of residual channels
        gate_channels (int): the number of channels for the gating function
        skip_out_channels (int): the number of channels in the skip output
        kernel_size (int): the size of the convolutional kernel
        cin_channels (int): the number of input channels for local conditioning
        upsample_scales (list): the list of scales to upsample the local conditioning features
        aux_context_window (int): the number of context frames
    """

    def __init__(
        self,
        out_channels=256,  # 出力のチャネル数
        layers=30,  # レイヤー数
        stacks=3,  # 畳み込みブロックの数
        residual_channels=64,  # 残差結合のチャネル数
        gate_channels=128,  # ゲートのチャネル数
        skip_out_channels=64,  # スキップ接続のチャネル数
        kernel_size=2,  # 1 次元畳み込みのカーネルサイズ
        cin_channels=80,  # 条件付け特徴量のチャネル数
        upsample_scales=None,  # アップサンプリングのスケール
        aux_context_window=0,  # アップサンプリング時に参照する近傍フレーム数
    ):
        super().__init__()
        self.out_channels = out_channels
        self.cin_channels = cin_channels
        self.aux_context_window = aux_context_window
        if upsample_scales is None:
            upsample_scales = [10, 8]
        self.upsample_scales = upsample_scales

        self.first_conv = Conv1d1x1(out_channels, residual_channels)

        # メインとなる畳み込み層
        self.main_conv_layers = nn.ModuleList()
        layers_per_stack = layers // stacks
        for layer in range(layers):
            dilation = 2 ** (layer % layers_per_stack)
            conv = ResSkipBlock(
                residual_channels,
                gate_channels,
                kernel_size,
                skip_out_channels,
                dilation=dilation,
                cin_channels=cin_channels,
            )
            self.main_conv_layers.append(conv)

        # スキップ接続の和から波形への変換
        self.last_conv_layers = nn.ModuleList(
            [
                nn.ReLU(),
                Conv1d1x1(skip_out_channels, skip_out_channels),
                nn.ReLU(),
                Conv1d1x1(skip_out_channels, out_channels),
            ]
        )

        # フレーム単位の特徴量をサンプル単位にアップサンプリング
        self.upsample_net = ConvInUpsampleNetwork(
            upsample_scales, cin_channels, aux_context_window
        )

[docs]    def forward(self, x, c):
        """Forward step

        Args:
            x (torch.Tensor): the input waveform
            c (torch.Tensor): the local conditioning feature

        Returns:
            torch.Tensor: the output waveform
        """
        # 量子化された離散値列から One-hot ベクトルに変換
        # (B, T) -> (B, T, out_channels) -> (B, out_channels, T)
        x = F.one_hot(x, self.out_channels).transpose(1, 2).float()

        # 条件付き特徴量のアップサンプリング
        c = self.upsample_net(c)
        assert c.size(-1) == x.size(-1)

        # One-hot ベクトルの次元から隠れ層の次元に変換
        x = self.first_conv(x)

        # メインの畳み込み層の処理
        # 各層におけるスキップ接続の出力を加算して保持
        skips = 0
        for f in self.main_conv_layers:
            x, h = f(x, c)
            skips += h

        # スキップ接続の和を入力として、出力を計算
        x = skips
        for f in self.last_conv_layers:
            x = f(x)

        # NOTE: 出力を確率値として解釈する場合には softmax が必要ですが、
        # 学習時には nn.CrossEntropyLoss の計算に置いて softmax の計算が行われるので、
        # ここでは明示的に softmax を計算する必要はありません
        return x

[docs]    def inference(self, c, num_time_steps=100, tqdm=lambda x: x):
        """Inference step

        Args:
            c (torch.Tensor): the local conditioning feature
            num_time_steps (int): the number of time steps to generate
            tqdm (lambda): a tqdm function to track progress

        Returns:
            torch.Tensor: the output waveform
        """
        self.clear_buffer()

        # Local conditioning
        B = c.shape[0]
        # (B, C, T)
        c = self.upsample_net(c)
        # (B, C, T) -> (B, T, C)
        c = c.transpose(1, 2).contiguous()

        outputs = []

        # 自己回帰生成における初期値
        current_input = torch.zeros(B, 1, self.out_channels).to(c.device)
        current_input[:, :, int(mulaw_quantize(0))] = 1

        if tqdm is None:
            ts = range(num_time_steps)
        else:
            ts = tqdm(range(num_time_steps))

        # 逐次的に生成
        for t in ts:
            # 時刻 t における入力は、時刻 t-1 における出力
            if t > 0:
                current_input = outputs[-1]

            # 時刻 t における条件付け特徴量
            ct = c[:, t, :].unsqueeze(1)

            x = current_input
            x = self.first_conv.incremental_forward(x)
            skips = 0
            for f in self.main_conv_layers:
                x, h = f.incremental_forward(x, ct)
                skips += h
            x = skips
            for f in self.last_conv_layers:
                if hasattr(f, "incremental_forward"):
                    x = f.incremental_forward(x)
                else:
                    x = f(x)
            # Softmax によって、出力をカテゴリカル分布のパラメータに変換
            x = F.softmax(x.view(B, -1), dim=1)
            # カテゴリカル分布からサンプリング
            x = torch.distributions.OneHotCategorical(x).sample()
            outputs += [x.data]

        # T x B x C
        # 各時刻における出力を結合
        outputs = torch.stack(outputs)
        # B x C x T
        outputs = outputs.transpose(0, 1).transpose(1, 2).contiguous()

        self.clear_buffer()
        return outputs

[docs]    def clear_buffer(self):
        """Clear the internal buffer."""
        self.first_conv.clear_buffer()
        for f in self.main_conv_layers:
            f.clear_buffer()
        for f in self.last_conv_layers:
            try:
                f.clear_buffer()
            except AttributeError:
                pass

[docs]    def remove_weight_norm_(self):
        """Remove weight normalization of the model"""

        def _remove_weight_norm(m):
            try:
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:
                return

        self.apply(_remove_weight_norm)