# Source code for ttslearn.tacotron.frontend.openjtalk

import re

# Phoneme inventory (plus the pau/sil labels)
phonemes = [
    "A", "E", "I", "N", "O", "U",
    "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g", "gy",
    "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o",
    "p", "py", "r", "ry", "s", "sh", "t", "ts", "ty", "u",
    "v", "w", "y", "z",
    "pau", "sil",
]

# Non-phoneme symbols emitted alongside the phonemes
extra_symbols = [
    "^",  # special symbol marking the start of a sentence <SOS>
    "$",  # special symbol marking the end of a sentence <EOS> (regular)
    "?",  # special symbol marking the end of a sentence <EOS> (interrogative)
    "_",  # pause
    "#",  # accent phrase boundary
    "[",  # position where the pitch rises
    "]",  # position where the pitch falls
]

_pad = "~"

# NOTE: the padding symbol comes first so that ID 0 stands for padding
symbols = [_pad, *extra_symbols, *phonemes]


# Lookup tables in both directions between a symbol and its position in `symbols`
_symbol_to_id = dict(zip(symbols, range(len(symbols))))
_id_to_symbol = dict(enumerate(symbols))


def numeric_feature_by_regex(regex, s):
    """Extract an integer feature from a string using a regular expression.

    Args:
        regex (str): Pattern whose first capture group holds the number.
        s (str): String to search.

    Returns:
        int: The first capture group of the first match converted with
        ``int``, or -50 when the pattern does not match anywhere in ``s``.
    """
    found = re.search(regex, s)
    # -50 is the value returned when the feature is absent from the string
    return -50 if found is None else int(found.group(1))


def pp_symbols(labels, drop_unvoiced_vowels=True):
    """Extract phoneme + prosody symbol sequence from input full-context labels

    The algorithm is based on [Kurihara 2021] [1]_ with some tweaks.

    Args:
        labels (list): Full-context label strings, one per phoneme
            (e.g. ``HTSLabelFile.contexts``).
        drop_unvoiced_vowels (bool): Replace the unvoiced vowels
            (``A``, ``E``, ``I``, ``O``, ``U``) with their lowercase
            counterparts. Defaults to True.

    Returns:
        list: List of phoneme + prosody symbols

    .. ipython::

        In [11]: import ttslearn

        In [12]: from nnmnkwii.io import hts

        In [13]: from ttslearn.tacotron.frontend.openjtalk import pp_symbols

        In [14]: labels = hts.load(ttslearn.util.example_label_file())

        In [15]: " ".join(pp_symbols(labels.contexts))
        Out[15]: '^ m i [ z u o # m a [ r e ] e sh i a k a r a ... $'

    .. [1] K. Kurihara, N. Seiyama, and T. Kumano, “Prosodic features control by
        symbols as input of sequence-to-sequence acoustic modeling for neural tts,”
        IEICE Transactions on Information and Systems, vol. E104.D, no. 2,
        pp. 302–311, 2021.
    """
    # Exact-match sets: a substring test against "AEIOU" / "aeiouAEIOUNcl"
    # would also accept "", "c", "l", "AE", ... which are not phonemes.
    unvoiced_vowels = frozenset(("A", "E", "I", "O", "U"))
    boundary_phonemes = frozenset(
        ("a", "e", "i", "o", "u", "A", "E", "I", "O", "U", "N", "cl")
    )

    PP = []
    N = len(labels)

    # Process the phonemes one by one, in order
    for n, lab_curr in enumerate(labels):
        # Current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)  # type: ignore

        # Treat unvoiced vowels as regular vowels
        if drop_unvoiced_vowels and p3 in unvoiced_vowels:
            p3 = p3.lower()

        # sil is only expected at the very beginning and the very end
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                PP.append("^")
            elif n == N - 1:
                # Interrogative or not; nothing is emitted for any other value
                e3 = numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    PP.append("$")
                elif e3 == 1:
                    PP.append("?")
            continue
        elif p3 == "pau":
            PP.append("_")
            continue
        else:
            PP.append(p3)

        # Accent type and position information (forward or backward)
        a1 = numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
        # Number of morae in the accent phrase
        f1 = numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # The rules below look one label ahead. When the sequence does not end
        # with sil there is no following label for the last phoneme; None
        # compares unequal to every integer, so no prosody symbol is emitted
        # instead of raising IndexError.
        if n + 1 < N:
            a2_next = numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
        else:
            a2_next = None

        # Accent phrase boundary
        if a3 == 1 and a2_next == 1 and p3 in boundary_phonemes:
            PP.append("#")
        # Pitch fall (accent nucleus)
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            PP.append("]")
        # Pitch rise
        elif a2 == 1 and a2_next == 2:
            PP.append("[")

    return PP


def num_vocab():
    """Get the size of the vocabulary

    Returns:
        int: Number of entries in ``symbols`` (padding symbol, extra symbols
        and phonemes)

    Examples:

        >>> from ttslearn.tacotron.frontend.openjtalk import num_vocab
        >>> num_vocab()
        52
    """
    return len(symbols)


def text_to_sequence(text):
    """Convert phoneme + prosody symbols to sequence of numbers

    Args:
        text (list): text as a list of phoneme + prosody symbols

    Returns:
        list: List of numbers

    Raises:
        KeyError: If a symbol is not part of the vocabulary.

    Examples:

        >>> from ttslearn.tacotron.frontend.openjtalk import text_to_sequence
        >>> text_to_sequence(["^", "m", "i", "[", "z", "o", "$"])
        [1, 31, 27, 6, 49, 35, 2]
    """
    return [_symbol_to_id[s] for s in text]


def sequence_to_text(seq):
    """Convert sequence of numbers to phoneme + prosody symbols

    Args:
        seq (list): Input sequence of numbers

    Returns:
        list: List of phoneme + prosody symbols

    Raises:
        KeyError: If a number is not a valid symbol ID.

    Examples:

        >>> from ttslearn.tacotron.frontend.openjtalk import sequence_to_text
        >>> sequence_to_text([1, 31, 27, 6, 49, 35, 2])
        ['^', 'm', 'i', '[', 'z', 'o', '$']
    """
    return [_id_to_symbol[s] for s in seq]