from __future__ import with_statement, print_function, absolute_import
import numpy as np
from collections import OrderedDict
class FileDataSource(object):
    """Interface for file-based data sources.

    Users are expected to implement a custom data source for their own
    data; every file data source must implement this interface. It tells
    a dataset where its files live and how to turn each file (or tuple
    of files) into a feature matrix.
    """

    def collect_files(self):
        """Gather the data source files.

        Returns:
            List or tuple of list: list of files, or a tuple of lists if
            multiple files are needed to collect features.
        """
        raise NotImplementedError

    def collect_features(self, *args):
        """Collect features given file path(s).

        Args:
            args: a file path, or a tuple of file paths.

        Returns:
            2darray: ``T x D`` features represented by a 2d array.
        """
        raise NotImplementedError
class Dataset(object):
    """A fixed-size set of features composed of multiple utterances.

    Concrete subclasses decide what one item is (an utterance or a
    single frame) and must provide ``__getitem__`` and ``__len__``.
    """

    def __getitem__(self, idx):
        """Get access to the dataset.

        Args:
            idx: index.

        Returns:
            features
        """
        raise NotImplementedError

    def __len__(self):
        """Length of the dataset.

        Returns:
            int: length of the dataset. Can be the number of utterances
            or the number of total frames, depending on implementation.
        """
        raise NotImplementedError
class FileSourceDataset(Dataset):
    """FileSourceDataset

    Most basic dataset implementation. It supports utterance-wise
    iteration and has a utility (:obj:`asarray` method) to convert the
    dataset to a three-dimensional :obj:`numpy.ndarray`.

    Speech features typically have different time resolutions per
    utterance, so a dataset cannot simply be represented as one array.
    To address this, the class can represent the set of features as an
    ``N x T^max x D`` array by padding with zeros, where ``N`` is the
    number of utterances, ``T^max`` is the maximum number of frames and
    ``D`` is the feature dimension.

    While this dataset loads features on-demand while indexing, if you
    are dealing with a relatively small dataset, it might be useful to
    convert it to an array and then use numpy/scipy functionality
    directly.

    Attributes:
        file_data_source (FileDataSource): Data source specifying 1)
            where to find the data to be loaded and 2) how to collect
            features from the files.
        collected_files (ndarray): Collected files; one row per
            utterance (a row holds multiple paths when several files
            are needed for one utterance).

    Args:
        file_data_source (FileDataSource): File data source.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> X.asarray(1000).shape
        (3, 1000, 425)
        >>> Y.asarray(1000).shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source):
        self.file_data_source = file_data_source
        collected_files = self.file_data_source.collect_files()
        # Normalize to a 2d array with one row per utterance: a tuple of
        # lists means multiple files are needed per utterance.
        if isinstance(collected_files, tuple):
            collected_files = np.asarray(collected_files).T
        else:
            collected_files = np.atleast_2d(collected_files).T
        self.collected_files = collected_files

    def __getitem__(self, idx):
        """Load and return features for the utterance(s) at ``idx``.

        Supports integer and slice indexing; a slice yields a list of
        ``T x D`` arrays (features are loaded on demand, not cached).
        """
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            return [self[i] for i in range(current, stop, step)]
        return self.file_data_source.collect_features(*self.collected_files[idx])

    def __len__(self):
        return len(self.collected_files)

    def asarray(self, padded_length, dtype=np.float32):
        """Convert the dataset to a zero-padded 3d numpy array.

        This tries to load the entire dataset into a single 3d array.

        Args:
            padded_length (int): Number of maximum time frames expected.
            dtype: dtype of the returned array (default ``np.float32``).

        Returns:
            3d-array: ``N x T^max x D`` array.

        Raises:
            RuntimeError: if any utterance has more than
                ``padded_length`` frames.
        """
        T = padded_length
        # Peek at the first utterance to learn the feature dimension.
        D = self[0].shape[-1]
        N = len(self)
        X = np.zeros((N, T, D), dtype=dtype)
        # NOTE: the original implementation also filled a `lengths` array
        # with dtype=np.int; it was never used, and `np.int` was removed
        # in NumPy 1.24, so the dead code has been dropped.
        for idx, paths in enumerate(self.collected_files):
            x = self.file_data_source.collect_features(*paths)
            if len(x) > T:
                raise RuntimeError("""
Num frames {} exceeded: {}. Try larger value for padded_length.""".format(
                    len(x), T))
            # TODO: segmentation algorithm?
            X[idx][:len(x), :] = x
        return X
class PaddedFileSourceDataset(FileSourceDataset):
    """PaddedFileSourceDataset

    Basic dataset with padding. Very similar to :obj:`FileSourceDataset`:
    it supports utterance-wise iteration and has a utility
    (:obj:`asarray` method) to convert the dataset to a
    three-dimensional :obj:`numpy.ndarray`.

    The difference from :obj:`FileSourceDataset` is that ``__getitem__``
    returns padded features as a ``T^max x D`` array, while
    :obj:`FileSourceDataset` returns a non-padded ``T x D`` array.

    Args:
        file_data_source (FileDataSource): File data source.
        padded_length (int): Padded length.

    Attributes:
        file_data_source (FileDataSource)
        padded_length (int)

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import PaddedFileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = PaddedFileSourceDataset(X, 1000), PaddedFileSourceDataset(Y, 1000)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        >>> X.asarray().shape
        (3, 1000, 425)
        >>> Y.asarray().shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source, padded_length):
        super(PaddedFileSourceDataset, self).__init__(file_data_source)
        self.padded_length = padded_length

    def _getitem_one_sample(self, idx):
        # Fetch the raw T x D features, then zero-pad the time axis up
        # to self.padded_length.
        x = super(PaddedFileSourceDataset, self).__getitem__(idx)
        remaining = self.padded_length - len(x)
        if remaining < 0:
            raise RuntimeError("""
Num frames {} exceeded: {}. Try larger value for padded_length.""".format(
                len(x), self.padded_length))
        return np.pad(x, [(0, remaining), (0, 0)],
                      mode="constant", constant_values=0)

    def __getitem__(self, idx):
        if not isinstance(idx, slice):
            return self._getitem_one_sample(idx)
        start, stop, step = idx.indices(len(self))
        padded = [self._getitem_one_sample(i)
                  for i in range(start, stop, step)]
        return np.array(padded)

    def asarray(self):
        """Convert the whole dataset into an ``N x T^max x D`` array."""
        return super(PaddedFileSourceDataset, self).asarray(self.padded_length)
class MemoryCacheDataset(Dataset):
    """MemoryCacheDataset

    A thin dataset wrapper class that has simple cache functionality.
    It supports utterance-wise iteration.

    Note:
        Eviction is FIFO — when the cache is full the *oldest inserted*
        entry is dropped (``OrderedDict.popitem(last=False)``). A cache
        hit does not refresh an entry's position, so this is not true
        LRU.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances. Keys are
            utterance indices and values are numpy arrays.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> from nnmnkwii.datasets import MemoryCacheDataset
        >>> X, Y = MemoryCacheDataset(X), MemoryCacheDataset(Y)
        >>> X.cached_utterances
        OrderedDict()
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> len(X.cached_utterances)
        3
    """

    def __init__(self, dataset, cache_size=777):
        self.dataset = dataset
        self.cached_utterances = OrderedDict()
        self.cache_size = cache_size

    def __getitem__(self, utt_idx):
        """Return features for ``utt_idx``, loading and caching on miss."""
        # Membership test directly on the dict (the redundant `.keys()`
        # call of the original has been removed).
        if utt_idx not in self.cached_utterances:
            # Cache miss: load data from the wrapped dataset.
            self.cached_utterances[utt_idx] = self.dataset[utt_idx]
            if len(self.cached_utterances) > self.cache_size:
                # Evict the oldest inserted entry (FIFO).
                self.cached_utterances.popitem(last=False)
        return self.cached_utterances[utt_idx]

    def __len__(self):
        return len(self.dataset)
class MemoryCacheFramewiseDataset(MemoryCacheDataset):
    """MemoryCacheFramewiseDataset

    A thin dataset wrapper class that has simple cache functionality.
    It supports frame-wise iteration. Different from other
    utterance-wise datasets, you will need to explicitly give the number
    of time frames for each utterance at construction, since the class
    has to know the size of the dataset to implement ``__len__``.

    Note:
        If you are doing random access to the dataset, please be careful
        to give a sufficiently large cache size, to avoid many file
        re-loadings.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        lengths (list): Frame lengths for each utterance.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> from nnmnkwii.datasets import MemoryCacheFramewiseDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> len(X)
        3
        >>> lengths = [len(x) for x in X]  # collect frame lengths
        >>> X = MemoryCacheFramewiseDataset(X, lengths)
        >>> Y = MemoryCacheFramewiseDataset(Y, lengths)
        >>> len(X)
        1859
        >>> X[0].shape
        (425,)
        >>> Y[0].shape
        (187,)
    """

    def __init__(self, dataset, lengths, cache_size=777):
        super(MemoryCacheFramewiseDataset, self).__init__(dataset, cache_size)
        self.lengths = lengths
        # cumsum_lengths[i] is the global index of the first frame of
        # utterance i; the trailing entry equals the total frame count.
        self.cumsum_lengths = np.hstack((0, np.cumsum(lengths)))
        # Plain int so __len__ returns a builtin integer.
        self.n_frames = int(np.sum(lengths))

    def _getitem_one_sample(self, frame_idx):
        # Binary search for the utterance containing frame_idx
        # (0-origin). searchsorted(side="right") - 1 yields the last
        # boundary <= frame_idx; this replaces the original linear-scan
        # np.argmax with an O(log N) lookup, with identical results.
        utt_idx = int(np.searchsorted(
            self.cumsum_lengths, frame_idx, side="right")) - 1
        frames = super(MemoryCacheFramewiseDataset, self).__getitem__(utt_idx)
        frame_idx_in_focused_utterance = frame_idx - \
            self.cumsum_lengths[utt_idx]
        return frames[frame_idx_in_focused_utterance]

    def __getitem__(self, frame_idx):
        """Return the ``D``-dim frame(s) at ``frame_idx`` (int or slice)."""
        if isinstance(frame_idx, slice):
            current, stop, step = frame_idx.indices(len(self))
            xs = [self._getitem_one_sample(i)
                  for i in range(current, stop, step)]
            return np.array(xs)
        else:
            return self._getitem_one_sample(frame_idx)

    def __len__(self):
        return self.n_frames