Source code for nnmnkwii.datasets.voice_statistics

from os import listdir
from os.path import exists, isdir, join, splitext

import numpy as np
from nnmnkwii.datasets import FileDataSource

# Speaker names accepted by ``WavFileDataSource``; each one is combined with
# an emotion name to form a data directory name (see ``_get_dir``).
available_speakers = [
    "fujitou",
    "tsuchiya",
    "uemura",
    "hiroshiba",
]

# Emotion names accepted by ``WavFileDataSource``.
available_emotions = [
    "angry",
    "happy",
    "normal",
]


def _get_dir(speaker, emotion):
    return "{}_{}".format(speaker, emotion)


class TranscriptionDataSource(FileDataSource):
    """Transcription data source for VoiceStatistics dataset

    Users are expected to inherit the class and implement ``collect_features``
    method, which defines how features are computed given a transcription.

    Transcriptions are read once, at construction time, from
    ``balance_sentences.txt`` located directly under ``data_root``.

    Args:
        data_root (str): Data root.
        column (str): ``sentence``, ``yomi`` or ``monophone``
          (``sentence_id`` is also accepted, see ``column_map``).
        max_files (int): Total number of files to be collected.

    Attributes:
        transcriptions (list): Transcriptions.

    Raises:
        RuntimeError: If ``balance_sentences.txt`` does not exist.
        ValueError: If ``column`` is not a key of ``column_map``.
    """

    # Column name -> index of the tab-separated field in each line.
    column_map = {"sentence_id": 0, "sentence": 1, "yomi": 2, "monophone": 3}

    def __init__(self, data_root, column="sentence", max_files=None):
        path = join(data_root, "balance_sentences.txt")
        if not exists(path):
            raise RuntimeError(
                'balance_sentences.txt doesn\'t exist at "{}"'.format(path)
            )

        self.transcriptions = []
        self.max_files = max_files
        if column not in self.column_map:
            raise ValueError(
                "Not supported column {}. It should be one of 'sentence',"
                " 'yomi' or 'monophone'.".format(column)
            )
        column_index = self.column_map[column]

        # Read with an explicit encoding instead of the platform default so
        # that the result does not depend on the locale.
        # NOTE(review): assumes the corpus file is distributed as UTF-8 --
        # confirm against the dataset.
        with open(path, encoding="utf-8") as f:
            for line in f:
                # header
                if line.startswith("sentence_id"):
                    continue
                # Skip blank lines (e.g. a trailing empty line); they carry
                # no fields and would otherwise fail on the column lookup.
                if not line.strip():
                    continue
                v = line.split("\t")[column_index].strip()
                self.transcriptions.append(v)

        # Sanity check: exactly 100 transcriptions are expected.
        assert (
            len(self.transcriptions) == 100
        ), "Expected 100 transcriptions, got {}".format(len(self.transcriptions))

    def collect_files(self):
        """Collect text transcriptions.

        .. warning::

            Note that it returns list of transcriptions (str), not file paths.

        Returns:
            list: List of text transcription. When ``max_files`` is None the
            stored list itself is returned; otherwise its first ``max_files``
            entries.
        """
        if self.max_files is None:
            return self.transcriptions
        return self.transcriptions[: self.max_files]


class WavFileDataSource(FileDataSource):
    """Wav file data source for Voice-statistics dataset.

    The data source collects wav files from voice-statistics.
    Users are expected to inherit the class and implement
    ``collect_features`` method, which defines how features are computed
    given a wav file path.

    Args:
        data_root (str): Data root
        speakers (list): List of speakers to load. Supported names of speaker
          are ``fujitou``, ``tsuchiya``, ``uemura`` and ``hiroshiba``.
        labelmap (dict[optional]): Dict of speaker labels. If None,
          it's assigned as incrementally (i.e., 0, 1, 2) for specified
          speakers.
        max_files (int): Total number of files to be collected.
        emotions (list): List of emotions we use. Supported names of emotions
          are ``angry``, ``happy`` and ``normal``. Defaults to
          ``["normal"]`` when None.

    Attributes:
        labels (numpy.ndarray): List of speaker identifiers determined by
          labelmap. Stored in ``collect_files``.

    Raises:
        ValueError: If an unknown speaker or emotion name is given.
    """

    def __init__(
        self, data_root, speakers, labelmap=None, max_files=None, emotions=None
    ):
        if emotions is None:
            emotions = ["normal"]
        for speaker in speakers:
            if speaker not in available_speakers:
                raise ValueError(
                    "Unknown speaker '{}'. It should be one of {}".format(
                        speaker, available_speakers
                    )
                )

        for emotion in emotions:
            if emotion not in available_emotions:
                raise ValueError(
                    "Unknown emotion '{}'. It should be one of {}".format(
                        emotion, available_emotions
                    )
                )

        self.data_root = data_root
        self.speakers = speakers
        self.emotions = emotions
        if labelmap is None:
            # Default labels follow the order in which speakers were given.
            labelmap = {speaker: idx for idx, speaker in enumerate(speakers)}
        self.labelmap = labelmap
        self.labels = None
        self.max_files = max_files

    def collect_files(self):
        """Collect wav files for specific speakers.

        For every (speaker, emotion) pair, the ``.wav`` files of the
        corresponding directory under ``data_root`` are gathered in sorted
        order. When ``max_files`` is set, the budget is split evenly (floor
        division) across all speaker/emotion directories.

        As a side effect, ``self.labels`` is set to an int32 array holding
        the speaker label of each returned path.

        Returns:
            list: List of collected wav files.

        Raises:
            RuntimeError: If a speaker/emotion directory doesn't exist.
        """
        n_dirs = len(self.speakers) * len(self.emotions)
        if self.max_files is None or n_dirs == 0:
            # No limit requested, or no directory will be visited at all.
            # The latter guard avoids dividing by zero for empty
            # ``speakers`` / ``emotions``.
            max_files_per_dir = None
        else:
            max_files_per_dir = (
                self.max_files // len(self.emotions) // len(self.speakers)
            )

        paths = []
        labels = []
        for speaker in self.speakers:
            files = []
            for emotion in self.emotions:
                d = join(self.data_root, _get_dir(speaker, emotion))
                if not isdir(d):
                    raise RuntimeError("{} doesn't exist.".format(d))

                candidates = (join(d, f) for f in listdir(d))
                wavs = sorted(p for p in candidates if splitext(p)[1] == ".wav")
                # Slicing with ``None`` keeps every file.
                files.extend(wavs[:max_files_per_dir])

            paths.extend(files)
            # Label lookup happens once per collected file, so a speaker
            # with no files never touches ``labelmap``.
            labels.extend(self.labelmap[speaker] for _ in files)

        self.labels = np.array(labels, dtype=np.int32)
        return paths


# Backward-compatibility alias: ``VoiceStatisticsWavFileDataSource`` refers to
# the same class object as ``WavFileDataSource``.
# TODO: remove this alias after v0.1.0.
VoiceStatisticsWavFileDataSource = WavFileDataSource