Source code for nnmnkwii.datasets

from collections import OrderedDict
from warnings import warn

import numpy as np
from tqdm import tqdm


class FileDataSource(object):
    """File data source interface.

    Users are expected to implement a custom data source for their own data.
    All file data sources must implement this interface.
    """
    def collect_files(self):
        """Collect data source files.

        Returns:
            List or tuple of list: List of files, or tuple of lists if you need
            multiple files to collect features.
        """
        raise NotImplementedError
    def collect_features(self, *args):
        """Collect features given path(s).

        Args:
            args: File path or tuple of file paths.

        Returns:
            2darray: ``T x D`` features represented by a 2d array.
        """
        raise NotImplementedError
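
# Illustrative sketch (not part of nnmnkwii): a minimal FileDataSource that
# collects ``.npy`` feature files from a directory and loads each one as a
# ``T x D`` array. The ``NpyFileSource`` name and the directory layout are
# assumptions made for demonstration.
class NpyFileSource(FileDataSource):
    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        from glob import glob
        from os.path import join

        # One path per utterance; features are loaded lazily on indexing.
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        # Receives one path per utterance, as returned by collect_files.
        return np.load(path)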
class Dataset(object):
    """Dataset represents a fixed-sized set of features composed of multiple
    utterances.
    """

    def __getitem__(self, idx):
        """Get access to the dataset.

        Args:
            idx : index

        Returns:
            features
        """
        raise NotImplementedError

    def __len__(self):
        """Length of the dataset.

        Returns:
            int: Length of the dataset. Can be the number of utterances or the
            total number of frames, depending on the implementation.
        """
        raise NotImplementedError
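
# Illustrative sketch (not part of nnmnkwii): the smallest possible Dataset,
# backed by an in-memory list of ``T x D`` arrays. Only ``__getitem__`` and
# ``__len__`` need to be provided.
class InMemoryDataset(Dataset):
    def __init__(self, arrays):
        self.arrays = arrays

    def __getitem__(self, idx):
        return self.arrays[idx]

    def __len__(self):
        return len(self.arrays)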
class FileSourceDataset(Dataset):
    """FileSourceDataset

    Most basic dataset implementation. It supports utterance-wise iteration and
    has a utility (:obj:`asarray` method) to convert the dataset to a three
    dimensional :obj:`numpy.ndarray`.

    Speech features typically have different time resolutions, so we cannot
    simply represent a dataset as an array. To address the issue, the dataset
    class represents a set of features as an ``N x T^max x D`` array by padding
    zeros, where ``N`` is the number of utterances, ``T^max`` is the maximum
    number of frames and ``D`` is the dimension of features, respectively.

    While this dataset loads features on-demand while indexing, if you are
    dealing with a relatively small dataset, it might be useful to convert it
    to an array and then do whatever you want with numpy/scipy functionalities.

    Attributes:
        file_data_source (FileDataSource): Data source to specify 1) where to
            find data to be loaded and 2) how to collect features from them.
        collected_files (ndarray): Collected files are stored.

    Args:
        file_data_source (FileDataSource): File data source.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> X.asarray(1000).shape
        (3, 1000, 425)
        >>> Y.asarray(1000).shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source):
        self.file_data_source = file_data_source
        collected_files = self.file_data_source.collect_files()

        # Multiple files
        if isinstance(collected_files, tuple):
            collected_files = np.asarray(collected_files).T
            lengths = np.array([len(files) for files in collected_files])
            if not (lengths == lengths[0]).all():
                raise RuntimeError(
                    """Mismatch of number of collected files {}.
You must collect the same number of files when you collect multiple pairs of files.""".format(
                        tuple(lengths)
                    )
                )
        else:
            collected_files = np.atleast_2d(collected_files).T
        if len(collected_files) == 0:
            warn("No files are collected. You might have specified the wrong data source.")

        self.collected_files = collected_files

    def __collect_features(self, paths):
        try:
            return self.file_data_source.collect_features(*paths)
        except TypeError as e:
            warn(
                "TypeError while iterating dataset.\n"
                + "Likely there's a mismatch between the number of collected files and "
                + "the expected number of arguments of `collect_features`.\n"
                + "Number of arguments: {}\n".format(len(paths))
                + "Arguments: {}".format(paths)
            )
            raise e

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            return [self[i] for i in range(current, stop, step)]
        paths = self.collected_files[idx]
        return self.__collect_features(paths)

    def __len__(self):
        return len(self.collected_files)
    def asarray(
        self, padded_length=None, dtype=np.float32, padded_length_guess=1000, verbose=0
    ):
        """Convert dataset to numpy array.

        This tries to load the entire dataset into a single 3d numpy array.

        Args:
            padded_length (int): Number of maximum time frames to be expected.
                If None, it is set to the actual maximum time length.
            dtype (numpy.dtype): Numpy dtype.
            padded_length_guess (int): Initial guess of the maximum time length
                of the padded dataset array. Used if ``padded_length`` is None.
            verbose (int): Verbosity level. If > 0, show a progress bar.

        Returns:
            3d-array: Array of shape ``N x T^max x D`` if ``padded_length`` is
            None, otherwise ``N x padded_length x D``.
        """
        collected_files = self.collected_files
        if padded_length is not None:
            T = padded_length
        else:
            T = padded_length_guess  # initial guess

        D = self[0].shape[-1]
        N = len(self)
        X = np.zeros((N, T, D), dtype=dtype)
        lengths = np.zeros(N, dtype=int)

        if verbose > 0:

            def custom_range(x):
                return tqdm(range(x))

        else:
            custom_range = range

        for idx in custom_range(len(collected_files)):
            paths = collected_files[idx]
            x = self.__collect_features(paths)
            lengths[idx] = len(x)
            if len(x) > T:
                if padded_length is not None:
                    raise RuntimeError(
                        """Num frames {} exceeded the padded length {}.
Try a larger value for padded_length, or set it to None.""".format(
                            len(x), T
                        )
                    )
                warn(
                    f"Reallocating array because num frames {len(x)} "
                    f"exceeded the current guess {T}.\n"
                    "To avoid memory re-allocations, try a larger `padded_length_guess` "
                    "or set `padded_length` explicitly."
                )
                n = len(x) - T
                # Pad zeros to the end of the time axis
                X = np.pad(
                    X, [(0, 0), (0, n), (0, 0)], mode="constant", constant_values=0
                )
                T = X.shape[1]
            X[idx][: len(x), :] = x
            lengths[idx] = len(x)

        if padded_length is None:
            max_len = np.max(lengths)
            X = X[:, :max_len, :]
        return X
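
# Illustrative usage sketch (not part of nnmnkwii): pair FileSourceDataset with
# the NpyFileSource example above. Indexing loads features lazily from disk;
# asarray packs everything into one zero-padded array, so the true frame
# lengths are collected first to mask the padding later. The "data" directory
# is an assumption for demonstration.
def _example_file_source_dataset_usage(data_root="data"):
    dataset = FileSourceDataset(NpyFileSource(data_root))
    x0 = dataset[0]  # a single (T, D) array, loaded on demand
    lengths = [len(x) for x in dataset]  # true number of frames per utterance
    X = dataset.asarray(padded_length=max(lengths), verbose=1)
    # X.shape == (len(dataset), max(lengths), D); rows beyond lengths[i] are zeros.
    return x0, X, lengths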
class PaddedFileSourceDataset(FileSourceDataset):
    """PaddedFileSourceDataset

    Basic dataset with padding. Very similar to :obj:`FileSourceDataset`, it
    supports utterance-wise iteration and has a utility (:obj:`asarray` method)
    to convert the dataset to a three dimensional :obj:`numpy.ndarray`.

    The difference from :obj:`FileSourceDataset` is that this returns padded
    features as a ``T^max x D`` array at ``__getitem__``, while
    :obj:`FileSourceDataset` returns a non-padded ``T x D`` array.

    Args:
        file_data_source (FileDataSource): File data source.
        padded_length (int): Padded length.

    Attributes:
        file_data_source (FileDataSource)
        padded_length (int)

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import PaddedFileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = PaddedFileSourceDataset(X, 1000), PaddedFileSourceDataset(Y, 1000)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        >>> X.asarray().shape
        (3, 1000, 425)
        >>> Y.asarray().shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source, padded_length):
        super(PaddedFileSourceDataset, self).__init__(file_data_source)
        self.padded_length = padded_length

    def _getitem_one_sample(self, idx):
        x = super(PaddedFileSourceDataset, self).__getitem__(idx)
        if len(x) > self.padded_length:
            raise RuntimeError(
                """Num frames {} exceeded the padded length {}.
Try a larger value for padded_length.""".format(
                    len(x), self.padded_length
                )
            )
        return np.pad(
            x,
            [(0, self.padded_length - len(x)), (0, 0)],
            mode="constant",
            constant_values=0,
        )

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            xs = [self._getitem_one_sample(i) for i in range(current, stop, step)]
            return np.array(xs)
        else:
            return self._getitem_one_sample(idx)
    def asarray(self, dtype=np.float32, verbose=0):
        return super(PaddedFileSourceDataset, self).asarray(
            self.padded_length, dtype=dtype, verbose=verbose
        )
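
# Illustrative usage sketch (not part of nnmnkwii): because every item of a
# PaddedFileSourceDataset has the same ``padded_length x D`` shape, items can
# be stacked directly into fixed-shape minibatches. NpyFileSource and the
# "data" directory are the assumptions from the example above.
def _example_padded_minibatch(data_root="data", padded_length=1000, batch_size=4):
    X = PaddedFileSourceDataset(NpyFileSource(data_root), padded_length)
    batch = np.stack([X[i] for i in range(batch_size)])  # (batch_size, padded_length, D)
    return batch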
class MemoryCacheDataset(Dataset):
    """MemoryCacheDataset

    A thin dataset wrapper class that has simple cache functionality. It
    supports utterance-wise iteration.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        cache_size (int): Cache size (in utterance units).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances. Keys are utterance
            indices and values are numpy arrays.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> from nnmnkwii.datasets import MemoryCacheDataset
        >>> X, Y = MemoryCacheDataset(X), MemoryCacheDataset(Y)
        >>> X.cached_utterances
        OrderedDict()
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> len(X.cached_utterances)
        3
    """

    def __init__(self, dataset, cache_size=777):
        self.dataset = dataset
        self.cached_utterances = OrderedDict()
        self.cache_size = cache_size

    def __getitem__(self, utt_idx):
        if utt_idx not in self.cached_utterances.keys():
            # Load data from file
            self.cached_utterances[utt_idx] = self.dataset[utt_idx]
            if len(self.cached_utterances) > self.cache_size:
                # Evict the oldest cached utterance (FIFO)
                self.cached_utterances.popitem(last=False)
        return self.cached_utterances[utt_idx]

    def __len__(self):
        return len(self.dataset)
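
# Illustrative usage sketch (not part of nnmnkwii): wrap an on-disk dataset so
# that repeated access (e.g. over several epochs) re-reads each file at most
# once while the utterance stays in the cache. NpyFileSource and the "data"
# directory are assumptions from the example above.
def _example_memory_cache(data_root="data"):
    X = MemoryCacheDataset(FileSourceDataset(NpyFileSource(data_root)), cache_size=100)
    x0 = X[0]  # loaded from file and cached
    x0_again = X[0]  # served from the in-memory cache, no file access
    return x0, x0_again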
class MemoryCacheFramewiseDataset(MemoryCacheDataset):
    """MemoryCacheFramewiseDataset

    A thin dataset wrapper class that has simple cache functionality. It
    supports frame-wise iteration. Unlike the utterance-wise datasets above,
    you need to explicitly give the number of time frames for each utterance
    at construction, since the class has to know the size of the dataset to
    implement ``__len__``.

    Note:
        If you are doing random access to the dataset, please be careful to
        give a sufficiently large cache size to avoid frequent file re-loading.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        lengths (list): Frame lengths for each utterance.
        cache_size (int): Cache size (in utterance units).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> from nnmnkwii.datasets import MemoryCacheFramewiseDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> len(X)
        3
        >>> lengths = [len(x) for x in X]  # collect frame lengths
        >>> X = MemoryCacheFramewiseDataset(X, lengths)
        >>> Y = MemoryCacheFramewiseDataset(Y, lengths)
        >>> len(X)
        1859
        >>> X[0].shape
        (425,)
        >>> Y[0].shape
        (187,)
    """

    def __init__(self, dataset, lengths, cache_size=777):
        super(MemoryCacheFramewiseDataset, self).__init__(dataset, cache_size)
        self.lengths = lengths
        self.cumsum_lengths = np.hstack((0, np.cumsum(lengths)))
        self.n_frames = np.sum(lengths)
        assert hasattr(self, "dataset")
        assert hasattr(self, "cached_utterances")
        assert hasattr(self, "cache_size")

    def _getitem_one_sample(self, frame_idx):
        # 0-origin
        utt_idx = np.argmax(self.cumsum_lengths > frame_idx) - 1
        frames = super(MemoryCacheFramewiseDataset, self).__getitem__(utt_idx)
        frame_idx_in_focused_utterance = frame_idx - self.cumsum_lengths[utt_idx]
        return frames[frame_idx_in_focused_utterance]

    def __getitem__(self, frame_idx):
        if isinstance(frame_idx, slice):
            current, stop, step = frame_idx.indices(len(self))
            xs = [self._getitem_one_sample(i) for i in range(current, stop, step)]
            return np.array(xs)
        else:
            return self._getitem_one_sample(frame_idx)

    def __len__(self):
        return self.n_frames
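
# Illustrative usage sketch (not part of nnmnkwii): draw random frame-level
# minibatches for a frame-wise model. The cache is sized to hold every
# utterance so random access does not trigger repeated file reads.
# NpyFileSource and the "data" directory are assumptions from the example above.
def _example_framewise_minibatch(data_root="data", batch_size=32, seed=0):
    utt_X = FileSourceDataset(NpyFileSource(data_root))
    lengths = [len(x) for x in utt_X]  # frame lengths, needed for __len__
    X = MemoryCacheFramewiseDataset(utt_X, lengths, cache_size=len(utt_X))
    rng = np.random.RandomState(seed)
    indices = rng.randint(0, len(X), size=batch_size)
    batch = np.array([X[i] for i in indices])  # (batch_size, D)
    return batch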