Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitsync
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added data/ccpp_data.npz
Binary file not shown.
174 changes: 131 additions & 43 deletions mlp/data_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,54 @@ def __init__(self, inputs, targets, batch_size, max_num_batches=-1,
"""
self.inputs = inputs
self.targets = targets
self.batch_size = batch_size
assert max_num_batches != 0 and not max_num_batches < -1, (
'max_num_batches should be -1 or > 0')
self.max_num_batches = max_num_batches
if batch_size < 1:
raise ValueError('batch_size must be >= 1')
self._batch_size = batch_size
if max_num_batches == 0 or max_num_batches < -1:
raise ValueError('max_num_batches must be -1 or > 0')
self._max_num_batches = max_num_batches
self._update_num_batches()
self.shuffle_order = shuffle_order
self._current_order = np.arange(inputs.shape[0])
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
self.new_epoch()

@property
def batch_size(self):
"""Number of data points to include in each batch."""
return self._batch_size

@batch_size.setter
def batch_size(self, value):
if value < 1:
raise ValueError('batch_size must be >= 1')
self._batch_size = value
self._update_num_batches()

@property
def max_num_batches(self):
"""Maximum number of batches to iterate over in an epoch."""
return self._max_num_batches

@max_num_batches.setter
def max_num_batches(self, value):
if value == 0 or value < -1:
raise ValueError('max_num_batches must be -1 or > 0')
self._max_num_batches = value
self._update_num_batches()

def _update_num_batches(self):
"""Updates number of batches to iterate over."""
# maximum possible number of batches is equal to number of whole times
# batch_size divides in to the number of data points which can be
# found using integer division
possible_num_batches = self.inputs.shape[0] // batch_size
possible_num_batches = self.inputs.shape[0] // self.batch_size
if self.max_num_batches == -1:
self.num_batches = possible_num_batches
else:
self.num_batches = min(self.max_num_batches, possible_num_batches)
self.shuffle_order = shuffle_order
if rng is None:
rng = np.random.RandomState(DEFAULT_SEED)
self.rng = rng
self.reset()

def __iter__(self):
"""Implements Python iterator interface.
Expand All @@ -63,24 +94,36 @@ def __iter__(self):
"""
return self

def reset(self):
"""Resets the provider to the initial state to use in a new epoch."""
def new_epoch(self):
"""Starts a new epoch (pass through data), possibly shuffling first."""
self._curr_batch = 0
if self.shuffle_order:
self.shuffle()

def __next__(self):
return self.next()

def reset(self):
"""Resets the provider to the initial state."""
inv_perm = np.argsort(self._current_order)
self._current_order = self._current_order[inv_perm]
self.inputs = self.inputs[inv_perm]
self.targets = self.targets[inv_perm]
self.new_epoch()

def shuffle(self):
"""Randomly shuffles order of data."""
new_order = self.rng.permutation(self.inputs.shape[0])
self.inputs = self.inputs[new_order]
self.targets = self.targets[new_order]
perm = self.rng.permutation(self.inputs.shape[0])
self._current_order = self._current_order[perm]
self.inputs = self.inputs[perm]
self.targets = self.targets[perm]

def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
if self._curr_batch + 1 > self.num_batches:
# no more batches in current iteration through data set so reset
# the dataset for another pass and indicate iteration is at end
self.reset()
# no more batches in current iteration through data set so start
# new epoch ready for another pass and indicate iteration is at end
self.new_epoch()
raise StopIteration()
# create an index slice corresponding to current batch number
batch_slice = slice(self._curr_batch * self.batch_size,
Expand All @@ -90,7 +133,6 @@ def next(self):
self._curr_batch += 1
return inputs_batch, targets_batch


class MNISTDataProvider(DataProvider):
"""Data provider for MNIST handwritten digit images."""

Expand Down Expand Up @@ -133,13 +175,10 @@ def __init__(self, which_set='train', batch_size=100, max_num_batches=-1,
super(MNISTDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

# def next(self):
# """Returns next data batch or raises `StopIteration` if at end."""
# inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
# return inputs_batch, self.to_one_of_k(targets_batch)
#
def __next__(self):
return self.next()
def next(self):
"""Returns next data batch or raises `StopIteration` if at end."""
inputs_batch, targets_batch = super(MNISTDataProvider, self).next()
return inputs_batch, self.to_one_of_k(targets_batch)

def to_one_of_k(self, int_targets):
"""Converts integer coded class target to 1 of K coded targets.
Expand All @@ -156,15 +195,17 @@ def to_one_of_k(self, int_targets):
to zero except for the column corresponding to the correct class
which is equal to one.
"""
raise NotImplementedError()
one_of_k_targets = np.zeros((int_targets.shape[0], self.num_classes))
one_of_k_targets[range(int_targets.shape[0]), int_targets] = 1
return one_of_k_targets


class MetOfficeDataProvider(DataProvider):
"""South Scotland Met Office weather data provider."""

def __init__(self, window_size, batch_size=10, max_num_batches=-1,
shuffle_order=True, rng=None):
"""Create a new Met Offfice data provider object.
"""Create a new Met Office data provider object.

Args:
window_size (int): Size of windows to split weather time series
Expand All @@ -180,27 +221,74 @@ def __init__(self, window_size, batch_size=10, max_num_batches=-1,
the data before each epoch.
rng (RandomState): A seeded random number generator.
"""
self.window_size = window_size
assert window_size > 1, 'window_size must be at least 2.'
data_path = os.path.join(
os.environ['MLP_DATA_DIR'], 'HadSSP_daily_qc.txt')
assert os.path.isfile(data_path), (
'Data file does not exist at expected path: ' + data_path
)
# load raw data from text file
# ...
raw = np.loadtxt(data_path, skiprows=3, usecols=range(2, 32))
assert window_size > 1, 'window_size must be at least 2.'
self.window_size = window_size
# filter out all missing datapoints and flatten to a vector
# ...
filtered = raw[raw >= 0].flatten()
# normalise data to zero mean, unit standard deviation
# ...
# convert from flat sequence to windowed data
# ...
mean = np.mean(filtered)
std = np.std(filtered)
normalised = (filtered - mean) / std
# create a view on to array corresponding to a rolling window
shape = (normalised.shape[-1] - self.window_size + 1, self.window_size)
strides = normalised.strides + (normalised.strides[-1],)
windowed = np.lib.stride_tricks.as_strided(
normalised, shape=shape, strides=strides)
# inputs are first (window_size - 1) entries in windows
# inputs = ...
inputs = windowed[:, :-1]
# targets are last entry in windows
# targets = ...
# initialise base class with inputs and targets arrays
# super(MetOfficeDataProvider, self).__init__(
# inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
def __next__(self):
return self.next()
targets = windowed[:, -1]
super(MetOfficeDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)

class CCPPDataProvider(DataProvider):

def __init__(self, which_set='train', input_dims=None, batch_size=10,
max_num_batches=-1, shuffle_order=True, rng=None):
"""Create a new Combined Cycle Power Plant data provider object.

Args:
which_set: One of 'train' or 'valid'. Determines which portion of
data this object should provide.
input_dims: Which of the four input dimension to use. If `None` all
are used. If an iterable of integers are provided (consisting
of a subset of {0, 1, 2, 3}) then only the corresponding
input dimensions are included.
batch_size (int): Number of data points to include in each batch.
max_num_batches (int): Maximum number of batches to iterate over
in an epoch. If `max_num_batches * batch_size > num_data` then
only as many batches as the data can be split into will be
used. If set to -1 all of the data will be used.
shuffle_order (bool): Whether to randomly permute the order of
the data before each epoch.
rng (RandomState): A seeded random number generator.
"""
data_path = os.path.join(
os.environ['MLP_DATA_DIR'], 'ccpp_data.npz')
assert os.path.isfile(data_path), (
'Data file does not exist at expected path: ' + data_path
)
# check a valid which_set was provided
assert which_set in ['train', 'valid'], (
'Expected which_set to be either train or valid '
'Got {0}'.format(which_set)
)
# check input_dims are valid
if not input_dims is not None:
input_dims = set(input_dims)
assert input_dims.issubset({0, 1, 2, 3}), (
'input_dims should be a subset of {0, 1, 2, 3}'
)
loaded = np.load(data_path)
inputs = loaded[which_set + '_inputs']
if input_dims is not None:
inputs = inputs[:, input_dims]
targets = loaded[which_set + '_targets']
super(CCPPDataProvider, self).__init__(
inputs, targets, batch_size, max_num_batches, shuffle_order, rng)
Loading