from collections import OrderedDict
from warnings import warn
import numpy as np
from tqdm import tqdm
class FileDataSource(object):
    """Interface for file-based data sources.

    Users are expected to implement a custom data source for their own data.
    All file data sources must implement this interface: where the files
    live (:obj:`collect_files`) and how to turn them into feature matrices
    (:obj:`collect_features`).
    """

    def collect_files(self):
        """Gather the paths of the files to be loaded.

        Returns:
            List or tuple of list: List of files, or tuple of lists if you
            need multiple files to collect features.
        """
        raise NotImplementedError

    def collect_features(self, *args):
        """Compute features for the given path(s).

        Args:
            args: File path or tuple of file paths.

        Returns:
            2darray: ``T x D`` features represented by a 2d array.
        """
        raise NotImplementedError
class Dataset(object):
    """Abstract base for a fixed-sized set of features composed of multiple
    utterances.

    Concrete datasets must support indexed access and report their size.
    """

    def __getitem__(self, idx):
        """Get access to the dataset.

        Args:
            idx: index

        Returns:
            features
        """
        raise NotImplementedError

    def __len__(self):
        """Length of the dataset.

        Returns:
            int: length of dataset. Can be number of utterances or number
            of total frames, depending on the implementation.
        """
        raise NotImplementedError
class FileSourceDataset(Dataset):
    """FileSourceDataset

    Most basic dataset implementation. It supports utterance-wise iteration
    and has a utility (:obj:`asarray` method) to convert the dataset to a
    three-dimensional :obj:`numpy.ndarray`.

    Speech features typically have different time resolutions, so we cannot
    simply represent a dataset as an array. To address the issue, the dataset
    class represents a set of features as an ``N x T^max x D`` array by
    padding zeros, where ``N`` is the number of utterances, ``T^max`` is the
    maximum number of frames and ``D`` is the dimension of features,
    respectively.

    While this dataset loads features on-demand while indexing, if you are
    dealing with a relatively small dataset, it might be useful to convert it
    to an array, and then do whatever with numpy/scipy functionalities.

    Attributes:
        file_data_source (FileDataSource): Data source to specify 1) where to
            find data to be loaded and 2) how to collect features from them.
        collected_files (ndarray): Collected files are stored.

    Args:
        file_data_source (FileDataSource): File data source.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> X.asarray(1000).shape
        (3, 1000, 425)
        >>> Y.asarray(1000).shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source):
        self.file_data_source = file_data_source
        collected_files = self.file_data_source.collect_files()

        # Multiple files per utterance: transpose so that each row holds the
        # group of paths belonging to one utterance.
        if isinstance(collected_files, tuple):
            collected_files = np.asarray(collected_files).T
            lengths = np.array([len(files) for files in collected_files])
            if not (lengths == lengths[0]).all():
                raise RuntimeError(
                    """Mismatch of number of collected files {}.
You must collect same number of files when you collect multiple pair of files.""".format(
                        tuple(lengths)
                    )
                )
        else:
            # Single file per utterance: normalize to a 2d (N x 1) layout so
            # downstream code can always unpack a row of paths.
            collected_files = np.atleast_2d(collected_files).T
        if len(collected_files) == 0:
            warn("No files are collected. You might have specified wrong data source.")

        self.collected_files = collected_files

    def __collect_features(self, paths):
        # Shared by __getitem__ and asarray so both report the same
        # diagnostic when the data source signature does not match.
        try:
            return self.file_data_source.collect_features(*paths)
        except TypeError as e:
            # BUG FIX: previously used .format(*paths), which showed only the
            # first path (and raised IndexError for empty paths); also fixed
            # the "argments" typo.
            warn(
                "TypeError while iterating dataset.\n"
                + "Likely there's mismatch in number of pair of collected files and "
                + "expected number of arguments of `collect_features`.\n"
                + "Number of arguments: {}\n".format(len(paths))
                + "Arguments: {}".format(paths)
            )
            raise e

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            return [self[i] for i in range(current, stop, step)]
        paths = self.collected_files[idx]
        return self.__collect_features(paths)

    def __len__(self):
        return len(self.collected_files)

    def asarray(
        self, padded_length=None, dtype=np.float32, padded_length_guess=1000, verbose=0
    ):
        """Convert dataset to numpy array.

        This tries to load the entire dataset into a single 3d numpy array.

        Args:
            padded_length (int): Number of maximum time frames to be expected.
                If None, it is set to actual maximum time length.
            dtype (numpy.dtype): Numpy dtype.
            padded_length_guess (int): Initial guess of max time length of
                padded dataset array. Used if ``padded_length`` is None.
            verbose (int): If > 0, show a progress bar (tqdm) while loading.

        Returns:
            3d-array: Array of shape ``N x T^max x D`` if ``padded_length`` is
            None, otherwise ``N x padded_length x D``.

        Raises:
            RuntimeError: If ``padded_length`` is given and an utterance
                exceeds it.
        """
        collected_files = self.collected_files
        # Initial time capacity: explicit padded_length, or a guess that is
        # grown on demand below.
        T = padded_length if padded_length is not None else padded_length_guess
        # Peek at the first utterance to determine the feature dimension.
        D = self[0].shape[-1]
        N = len(self)
        X = np.zeros((N, T, D), dtype=dtype)
        lengths = np.zeros(N, dtype=int)

        if verbose > 0:
            def custom_range(x):
                return tqdm(range(x))
        else:
            custom_range = range

        for idx in custom_range(len(collected_files)):
            paths = collected_files[idx]
            x = self.__collect_features(paths)
            if len(x) > T:
                if padded_length is not None:
                    raise RuntimeError(
                        """Num frames {} exceeded: {}.
Try larger value for padded_length, or set to None""".format(
                            len(x), T
                        )
                    )
                # BUG FIX: the original message only interpolated its first
                # f-string segment, leaving "{T}" literal, and was missing a
                # space ("exceededcurrent").
                warn(
                    f"Reallocating array because num frames {len(x)} "
                    f"exceeded current guess {T}.\n"
                    "To avoid memory re-allocations, try large `padded_length_guess` "
                    "or set `padded_length` explicitly."
                )
                n = len(x) - T
                # Pad zeros at the end of the time axis to grow capacity.
                X = np.pad(
                    X, [(0, 0), (0, n), (0, 0)], mode="constant", constant_values=0
                )
                T = X.shape[1]
            X[idx][: len(x), :] = x
            lengths[idx] = len(x)

        if padded_length is None:
            # Trim the guess down to the actual maximum length.
            max_len = np.max(lengths)
            X = X[:, :max_len, :]
        return X
class PaddedFileSourceDataset(FileSourceDataset):
    """PaddedFileSourceDataset

    Basic dataset with padding. Very similar to :obj:`FileSourceDataset`,
    it supports utterance-wise iteration and has a
    utility (:obj:`asarray` method) to convert the dataset to a three
    dimensional :obj:`numpy.ndarray`.

    The difference from :obj:`FileSourceDataset` is that this returns
    padded features as a ``T^max x D`` array at ``__getitem__``, while
    :obj:`FileSourceDataset` returns a not-padded ``T x D`` array.

    Args:
        file_data_source (FileDataSource): File data source.
        padded_length (int): Padded length.

    Attributes:
        file_data_source (FileDataSource)
        padded_length (int)

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import PaddedFileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = PaddedFileSourceDataset(X, 1000), PaddedFileSourceDataset(Y, 1000)
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        (1000, 425) (1000, 187)
        >>> X.asarray().shape
        (3, 1000, 425)
        >>> Y.asarray().shape
        (3, 1000, 187)
    """

    def __init__(self, file_data_source, padded_length):
        super(PaddedFileSourceDataset, self).__init__(file_data_source)
        self.padded_length = padded_length

    def _getitem_one_sample(self, idx):
        # Load the raw (unpadded) features, then zero-pad the time axis up
        # to padded_length.
        x = super(PaddedFileSourceDataset, self).__getitem__(idx)
        if len(x) > self.padded_length:
            raise RuntimeError(
                """
Num frames {} exceeded: {}. Try larger value for padded_length.""".format(
                    len(x), self.padded_length
                )
            )
        return np.pad(
            x,
            [(0, self.padded_length - len(x)), (0, 0)],
            mode="constant",
            constant_values=0,
        )

    def __getitem__(self, idx):
        """Return padded ``padded_length x D`` features (stacked for slices)."""
        if isinstance(idx, slice):
            current, stop, step = idx.indices(len(self))
            xs = [self._getitem_one_sample(i) for i in range(current, stop, step)]
            return np.array(xs)
        else:
            return self._getitem_one_sample(idx)

    def asarray(self, dtype=np.float32, verbose=0):
        """Convert dataset to an ``N x padded_length x D`` array."""
        return super(PaddedFileSourceDataset, self).asarray(
            self.padded_length, dtype=dtype, verbose=verbose
        )
class MemoryCacheDataset(Dataset):
    """MemoryCacheDataset

    A thin dataset wrapper class that has simple cache functionality. It
    supports utterance-wise iteration.

    Note:
        Eviction is FIFO on insertion order (oldest inserted entry is
        dropped), not LRU — a cache hit does not refresh an entry's position.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances. Keys are
            utterance indices and values are numpy arrays.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> from nnmnkwii.datasets import MemoryCacheDataset
        >>> X, Y = MemoryCacheDataset(X), MemoryCacheDataset(Y)
        >>> X.cached_utterances
        OrderedDict()
        >>> for (x, y) in zip(X, Y):
        ...     print(x.shape, y.shape)
        ...
        (578, 425) (578, 187)
        (675, 425) (675, 187)
        (606, 425) (606, 187)
        >>> len(X.cached_utterances)
        3
    """

    def __init__(self, dataset, cache_size=777):
        self.dataset = dataset
        self.cached_utterances = OrderedDict()
        self.cache_size = cache_size

    def __getitem__(self, utt_idx):
        # Membership test directly on the dict; the redundant `.keys()` call
        # of the original has been dropped.
        if utt_idx not in self.cached_utterances:
            # Cache miss: load data from the underlying dataset.
            self.cached_utterances[utt_idx] = self.dataset[utt_idx]
            if len(self.cached_utterances) > self.cache_size:
                # Evict the oldest inserted entry (FIFO).
                self.cached_utterances.popitem(last=False)
        return self.cached_utterances[utt_idx]

    def __len__(self):
        return len(self.dataset)
class MemoryCacheFramewiseDataset(MemoryCacheDataset):
    """MemoryCacheFramewiseDataset

    A thin dataset wrapper class that has simple cache functionality. It
    supports frame-wise iteration. Different from other utterance-wise
    datasets, you will need to explicitly give the number of time frames for
    each utterance at construction, since the class has to know the size of
    the dataset to implement ``__len__``.

    Note:
        If you are doing random access to the dataset, please be careful to
        give a sufficiently large cache size, to avoid many file re-loadings.

    Args:
        dataset (Dataset): Dataset implementation to wrap.
        lengths (list): Frame lengths for each utterance.
        cache_size (int): Cache size (utterance unit).

    Attributes:
        dataset (Dataset): Dataset
        cached_utterances (OrderedDict): Loaded utterances.
        cache_size (int): Cache size.

    Examples:
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> from nnmnkwii.datasets import MemoryCacheFramewiseDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> len(X)
        3
        >>> lengths = [len(x) for x in X]  # collect frame lengths
        >>> X = MemoryCacheFramewiseDataset(X, lengths)
        >>> Y = MemoryCacheFramewiseDataset(Y, lengths)
        >>> len(X)
        1859
        >>> X[0].shape
        (425,)
        >>> Y[0].shape
        (187,)
    """

    def __init__(self, dataset, lengths, cache_size=777):
        super(MemoryCacheFramewiseDataset, self).__init__(dataset, cache_size)
        self.lengths = lengths
        # cumsum_lengths[k] is the global index of the first frame of
        # utterance k; the leading 0 anchors utterance 0.
        self.cumsum_lengths = np.hstack((0, np.cumsum(lengths)))
        self.n_frames = np.sum(lengths)
        assert hasattr(self, "dataset")
        assert hasattr(self, "cached_utterances")
        assert hasattr(self, "cache_size")

    def _getitem_one_sample(self, frame_idx):
        # Map the 0-origin global frame index to its owning utterance: the
        # first cumsum entry strictly greater than frame_idx marks the next
        # utterance, so step back by one.
        utt_idx = np.argmax(self.cumsum_lengths > frame_idx) - 1
        utterance = super(MemoryCacheFramewiseDataset, self).__getitem__(utt_idx)
        local_idx = frame_idx - self.cumsum_lengths[utt_idx]
        return utterance[local_idx]

    def __getitem__(self, frame_idx):
        # Guard clause: plain integer access is the common case.
        if not isinstance(frame_idx, slice):
            return self._getitem_one_sample(frame_idx)
        start, stop, step = frame_idx.indices(len(self))
        return np.array(
            [self._getitem_one_sample(i) for i in range(start, stop, step)]
        )

    def __len__(self):
        return self.n_frames