# Source code for ttslearn.tacotron.encoder

# Acknowledgement: some of the code was adapted from ESPnet
#  Copyright 2019 Nagoya University (Tomoki Hayashi)
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


def encoder_init(m):
    """Weight-initialization hook for ``nn.Module.apply``.

    Re-initializes the weights of every ``nn.Conv1d`` sub-module with
    Xavier-uniform initialization scaled by the ReLU gain; all other
    module types are left untouched.

    Args:
        m (nn.Module): sub-module visited by ``Module.apply``
    """
    if not isinstance(m, nn.Conv1d):
        return
    gain = nn.init.calculate_gain("relu")
    nn.init.xavier_uniform_(m.weight, gain)


class Encoder(nn.Module):
    """Encoder of Tacotron 2.

    Embeds a sequence of character IDs, models local temporal context
    with a stack of 1-D convolutions, and models long-term dependencies
    with a bidirectional LSTM.

    Args:
        num_vocab (int): number of vocabulary entries (character IDs)
        embed_dim (int): dimension of the character embeddings
        hidden_dim (int): dimension of the encoder hidden units
        conv_layers (int): number of convolutional layers
        conv_channels (int): number of channels in each convolutional layer
        conv_kernel_size (int): kernel size of the convolutional layers
        dropout (float): dropout rate applied after each convolution
    """

    def __init__(
        self,
        num_vocab=51,
        embed_dim=512,
        hidden_dim=512,
        conv_layers=3,
        conv_channels=512,
        conv_kernel_size=5,
        dropout=0.5,
    ):
        super(Encoder, self).__init__()
        # Character embedding table; index 0 is reserved for padding.
        self.embed = nn.Embedding(num_vocab, embed_dim, padding_idx=0)

        # Stack of 1-D convolutions: models local temporal dependencies.
        modules = nn.ModuleList()
        for idx in range(conv_layers):
            n_in = embed_dim if idx == 0 else conv_channels
            modules += [
                nn.Conv1d(
                    n_in,
                    conv_channels,
                    conv_kernel_size,
                    padding=(conv_kernel_size - 1) // 2,
                    bias=False,  # BatchNorm directly follows, so a bias is redundant
                ),
                nn.BatchNorm1d(conv_channels),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        self.convs = nn.Sequential(*modules)

        # Bidirectional LSTM: models long-term dependencies.
        self.blstm = nn.LSTM(
            conv_channels, hidden_dim // 2, 1, batch_first=True, bidirectional=True
        )

        # Re-initialize the Conv1d weights (see encoder_init).
        self.apply(encoder_init)

    def forward(self, seqs, in_lens):
        """Encode a batch of padded character-ID sequences.

        Args:
            seqs (torch.Tensor): input sequences, shape ``(batch, time)``
            in_lens (torch.Tensor): valid length of each input sequence

        Returns:
            torch.Tensor: encoded sequences, shape ``(batch, time, hidden_dim)``
        """
        x = self.embed(seqs)
        # Conv1d expects (batch, channels, time) while the embedding output
        # is (batch, time, channels), hence the transposes around the convs.
        x = self.convs(x.transpose(1, 2)).transpose(1, 2)
        # Pack so the Bi-LSTM skips padded frames; pack_padded_sequence
        # requires the lengths tensor to live on the CPU.
        packed = pack_padded_sequence(x, in_lens.to("cpu"), batch_first=True)
        outs, _ = self.blstm(packed)
        outs, _ = pad_packed_sequence(outs, batch_first=True)
        return outs