from __future__ import with_statement, print_function, absolute_import
import numpy as np
from collections import OrderedDict
class FileDataSource(object):
    """Interface for file-based data sources.

    Users are expected to implement a custom data source for their own
    data; every file data source must implement this interface. It tells
    a dataset where its files live and how to turn each file (or tuple
    of files) into a feature matrix.
    """

    def collect_files(self):
        """Gather the data source files.

        Returns:
            List or tuple of list: list of files, or a tuple of lists if
            multiple files are needed to collect features.
        """
        raise NotImplementedError

    def collect_features(self, *args):
        """Collect features given file path(s).

        Args:
            args: a file path, or a tuple of file paths.

        Returns:
            2darray: ``T x D`` features represented by a 2d array.
        """
        raise NotImplementedError
class Dataset(object):
    """A fixed-size set of features composed of multiple utterances.

    Concrete subclasses decide what one item is (an utterance or a
    single frame) and must provide ``__getitem__`` and ``__len__``.
    """

    def __getitem__(self, idx):
        """Get access to the dataset.

        Args:
            idx: index.

        Returns:
            features
        """
        raise NotImplementedError

    def __len__(self):
        """Length of the dataset.

        Returns:
            int: length of the dataset. Can be the number of utterances
            or the number of total frames, depending on implementation.
        """
        raise NotImplementedError
class FileSourceDataset(Dataset):
    """FileSourceDataset

    Most basic dataset implementation. It supports utterance-wise
    iteration and has a utility (:obj:`asarray` method) to convert the
    dataset to a three-dimensional :obj:`numpy.ndarray`.

    Speech features typically have different time resolutions per
    utterance, so a dataset cannot simply be represented as one array.
    To address this, the class can represent the set of features as an
    ``N x T^max x D`` array by padding with zeros, where ``N`` is the
    number of utterances, ``T^max`` is the maximum number of frames and
    ``D`` is the feature dimension.

    While this dataset loads features on-demand while indexing, if you
    are dealing with a relatively small dataset, it might be useful to
    convert it to an array and then use numpy/scipy functionality
    directly.

    Attributes:
        file_data_source (FileDataSource): Data source specifying 1)
            where to find the data to be loaded and 2) how to collect
            features from the files.
        collected_files (ndarray): Collected files; one row per
            utterance (a row holds multiple paths when several files
            are needed for one utterance).

    Args:
        file_data_source (FileDataSource): File data source.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> X.asarray(1000).shape
        (3, 1000, 425)
        >>> Y.asarray(1000).shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source):
        self.file_data_source = file_data_source
        collected_files = self.file_data_source.collect_files()
        # Normalize to a 2d array with one row per utterance: a tuple of
        # lists means multiple files are needed per utterance.
        if isinstance(collected_files, tuple):
            collected_files = np.asarray(collected_files).T
        else:
            collected_files = np.atleast_2d(collected_files).T
        self.collected_files = collected_files

    def __getitem__(self, idx):
        """Load and return features for the utterance(s) at ``idx``.

        Supports integer and slice indexing; a slice yields a list of
        ``T x D`` arrays (features are loaded on demand, not cached).
        """
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            return [self[i] for i in range(current, stop, step)]
        return self.file_data_source.collect_features(*self.collected_files[idx])

    def __len__(self):
        return len(self.collected_files)

    def asarray(self, padded_length, dtype=np.float32):
        """Convert the dataset to a zero-padded 3d numpy array.

        This tries to load the entire dataset into a single 3d array.

        Args:
            padded_length (int): Number of maximum time frames expected.
            dtype: dtype of the returned array (default ``np.float32``).

        Returns:
            3d-array: ``N x T^max x D`` array.

        Raises:
            RuntimeError: if any utterance has more than
                ``padded_length`` frames.
        """
        T = padded_length
        # Peek at the first utterance to learn the feature dimension.
        D = self[0].shape[-1]
        N = len(self)
        X = np.zeros((N, T, D), dtype=dtype)
        # NOTE: the original implementation also filled a `lengths` array
        # with dtype=np.int; it was never used, and `np.int` was removed
        # in NumPy 1.24, so the dead code has been dropped.
        for idx, paths in enumerate(self.collected_files):
            x = self.file_data_source.collect_features(*paths)
            if len(x) > T:
                raise RuntimeError("""
Num frames {} exceeded: {}. Try larger value for padded_length.""".format(
                    len(x), T))
            # TODO: segmentation algorithm?
            X[idx][:len(x), :] = x
        return X
class PaddedFileSourceDataset(FileSourceDataset):
    """PaddedFileSourceDataset

    Basic dataset with padding. Very similar to :obj:`FileSourceDataset`:
    it supports utterance-wise iteration and has a utility
    (:obj:`asarray` method) to convert the dataset to a
    three-dimensional :obj:`numpy.ndarray`.

    The difference from :obj:`FileSourceDataset` is that ``__getitem__``
    returns padded features as a ``T^max x D`` array, while
    :obj:`FileSourceDataset` returns a non-padded ``T x D`` array.

    Args:
        file_data_source (FileDataSource): File data source.
        padded_length (int): Padded length.

    Attributes:
        file_data_source (FileDataSource)
        padded_length (int)

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import PaddedFileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = PaddedFileSourceDataset(X, 1000), PaddedFileSourceDataset(Y, 1000)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        >>> X.asarray().shape
        (3, 1000, 425)
        >>> Y.asarray().shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source, padded_length):
        super(PaddedFileSourceDataset, self).__init__(file_data_source)
        self.padded_length = padded_length

    def _getitem_one_sample(self, idx):
        # Fetch the raw T x D features, then zero-pad the time axis up
        # to self.padded_length.
        x = super(PaddedFileSourceDataset, self).__getitem__(idx)
        remaining = self.padded_length - len(x)
        if remaining < 0:
            raise RuntimeError("""
Num frames {} exceeded: {}. Try larger value for padded_length.""".format(
                len(x), self.padded_length))
        return np.pad(x, [(0, remaining), (0, 0)],
                      mode="constant", constant_values=0)

    def __getitem__(self, idx):
        if not isinstance(idx, slice):
            return self._getitem_one_sample(idx)
        start, stop, step = idx.indices(len(self))
        padded = [self._getitem_one_sample(i)
                  for i in range(start, stop, step)]
        return np.array(padded)

    def asarray(self):
        """Convert the whole dataset into an ``N x T^max x D`` array."""
        return super(PaddedFileSourceDataset, self).asarray(self.padded_length)
class MemoryCacheDataset(Dataset):
    """MemoryCacheDataset

    A thin dataset wrapper class that has simple cache functionality.
    It supports utterance-wise iteration.

    Note:
        Eviction is FIFO — when the cache is full the *oldest inserted*
        entry is dropped (``OrderedDict.popitem(last=False)``). A cache
        hit does not refresh an entry's position, so this is not true
        LRU.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances. Keys are
            utterance indices and values are numpy arrays.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> from nnmnkwii.datasets import MemoryCacheDataset
        >>> X, Y = MemoryCacheDataset(X), MemoryCacheDataset(Y)
        >>> X.cached_utterances
        OrderedDict()
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> len(X.cached_utterances)
        3
    """

    def __init__(self, dataset, cache_size=777):
        self.dataset = dataset
        self.cached_utterances = OrderedDict()
        self.cache_size = cache_size

    def __getitem__(self, utt_idx):
        """Return features for ``utt_idx``, loading and caching on miss."""
        # Membership test directly on the dict (the redundant `.keys()`
        # call of the original has been removed).
        if utt_idx not in self.cached_utterances:
            # Cache miss: load data from the wrapped dataset.
            self.cached_utterances[utt_idx] = self.dataset[utt_idx]
            if len(self.cached_utterances) > self.cache_size:
                # Evict the oldest inserted entry (FIFO).
                self.cached_utterances.popitem(last=False)
        return self.cached_utterances[utt_idx]

    def __len__(self):
        return len(self.dataset)
class MemoryCacheFramewiseDataset(MemoryCacheDataset):
    """MemoryCacheFramewiseDataset

    A thin dataset wrapper class that has simple cache functionality.
    It supports frame-wise iteration. Different from other
    utterance-wise datasets, you will need to explicitly give the number
    of time frames for each utterance at construction, since the class
    has to know the size of the dataset to implement ``__len__``.

    Note:
        If you are doing random access to the dataset, please be careful
        to give a sufficiently large cache size, to avoid many file
        re-loadings.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        lengths (list): Frame lengths for each utterance.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> from nnmnkwii.datasets import MemoryCacheFramewiseDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> len(X)
        3
        >>> lengths = [len(x) for x in X]  # collect frame lengths
        >>> X = MemoryCacheFramewiseDataset(X, lengths)
        >>> Y = MemoryCacheFramewiseDataset(Y, lengths)
        >>> len(X)
        1859
        >>> X[0].shape
        (425,)
        >>> Y[0].shape
        (187,)
    """

    def __init__(self, dataset, lengths, cache_size=777):
        super(MemoryCacheFramewiseDataset, self).__init__(dataset, cache_size)
        self.lengths = lengths
        # cumsum_lengths[i] is the global index of the first frame of
        # utterance i; the trailing entry equals the total frame count.
        self.cumsum_lengths = np.hstack((0, np.cumsum(lengths)))
        # Plain int so __len__ returns a builtin integer.
        self.n_frames = int(np.sum(lengths))

    def _getitem_one_sample(self, frame_idx):
        # Binary search for the utterance containing frame_idx
        # (0-origin). searchsorted(side="right") - 1 yields the last
        # boundary <= frame_idx; this replaces the original linear-scan
        # np.argmax with an O(log N) lookup, with identical results.
        utt_idx = int(np.searchsorted(
            self.cumsum_lengths, frame_idx, side="right")) - 1
        frames = super(MemoryCacheFramewiseDataset, self).__getitem__(utt_idx)
        frame_idx_in_focused_utterance = frame_idx - \
            self.cumsum_lengths[utt_idx]
        return frames[frame_idx_in_focused_utterance]

    def __getitem__(self, frame_idx):
        """Return the ``D``-dim frame(s) at ``frame_idx`` (int or slice)."""
        if isinstance(frame_idx, slice):
            current, stop, step = frame_idx.indices(len(self))
            xs = [self._getitem_one_sample(i)
                  for i in range(current, stop, step)]
            return np.array(xs)
        else:
            return self._getitem_one_sample(frame_idx)

    def __len__(self):
        return self.n_frames