Update data_handler.py #14

Closed
wants to merge 1 commit into from
78 changes: 67 additions & 11 deletions data_handler.py
@@ -2,6 +2,8 @@
import pandas as pd
import tensorflow as tf
from load_data import load_data, smooth, transform
+from sklearn.covariance import GraphicalLasso, EmpiricalCovariance
+from sklearn import preprocessing

class DataHandler:
def __init__(
@@ -12,7 +14,8 @@ def __init__(
window_batch_size=10,
window_shift=1,
splits=[0.70, 0.15, 0.15],
-        predict_timestamp=3
+        predict_timestamp=3,
+        num_per_group=5
):
"""Create a DataHandler which is able to load and manipulate data in different ways."""
#self.is_normalized = False
@@ -25,6 +28,7 @@ def __init__(
self.window_shift = window_shift
self.window_batch_size = window_batch_size # can find a best from 8 10 16
self.max_num_features = num_features
+        self.num_per_group = num_per_group
self.predict_timestamp = predict_timestamp
self.clusters = None
self.clusters_func = None
@@ -33,6 +37,8 @@ def __init__(
self._load_data(config)
self.use_splits(splits)
self.clusters_abund, self.clusters_abund_size = self._make_abundance_clusters()
+        self.clusters_graph, self.clusters_graph_size = self._make_graph_clusters()
+        assert self.clusters_abund_size == self.clusters_graph_size

@property
def train(self):
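A quick usage sketch of the new constructor surface. Hedged: the leading parameters folded out of this diff are assumed to be `config` and `num_features` (both are referenced inside `__init__`), and the config values below are invented; only the `smoothing_factor` and `transform` keys are confirmed by `_load_data` further down.

```python
# Sketch only; parameter names before window_batch_size are assumptions.
config = {'smoothing_factor': 8, 'transform': 'standardize'}  # hypothetical values

dh = DataHandler(
    config,
    num_features=200,        # taxa kept after _load_data truncates the arrays
    window_batch_size=10,
    predict_timestamp=3,
    num_per_group=5,         # new in this PR: taxa per abundance/graph cluster
)
```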
@@ -44,7 +50,7 @@ def train(self):
@property
def val(self):
if self._train_val_index == self._val_test_index:
-            return self.test # if no val data, it should be test
+            return self.test
elif self.clusters is None:
return self._all.iloc[self._train_val_index:self._val_test_index, :self.max_num_features]
else:
@@ -74,17 +80,18 @@ def all_nontrans(self):
@property
def train_batched(self):
"""Batches of training data."""
-        return self._make_batched_dataset(self.train, endindex=True)
+        return self._make_batched_dataset(self._all.iloc[:self._train_val_index+self.predict_timestamp, self.clusters], endindex=True)

@property
def val_batched(self):
"""Batches of validation data."""
-        return self._make_batched_dataset(self.val, endindex=True)
+        return self._make_batched_dataset(self._all.iloc[self._train_val_index-self.predict_timestamp:self._val_test_index+self.predict_timestamp,
+                                                          self.clusters], endindex=True)

@property
def test_batched(self):
"""Batches of test data."""
-        return self._make_batched_dataset(self.test, endindex=True)
+        return self._make_batched_dataset(self._all.iloc[self._val_test_index-self.predict_timestamp:, self.clusters], endindex=True)

@property
def all_batched(self):
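The three batched properties no longer reuse `train`/`val`/`test` directly: each split is padded by `predict_timestamp` rows, apparently so that windows ending at a split boundary still have their forecast targets (and, for val/test, some input context from the previous split), and all three now respect the active `self.clusters` column selection. A toy illustration of the new row boundaries (numbers invented):

```python
predict_timestamp = 3
train_val_index, val_test_index = 70, 85      # invented split indices

train_rows = slice(0, train_val_index + predict_timestamp)     # rows 0..72
val_rows = slice(train_val_index - predict_timestamp,          # rows 67..87
                 val_test_index + predict_timestamp)
test_rows = slice(val_test_index - predict_timestamp, None)    # rows 82..end
```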
@@ -130,13 +137,13 @@ def _make_batched_dataset(self, dataset, endindex):
)

def _make_abundance_clusters(self):
-        clust = np.zeros(self.clusters_func.shape, dtype=int)
-        if self.max_num_features is None:
+        clust = np.zeros(self.max_num_features, dtype=int)
+        if self.num_per_group is None:
return clust
i = 0
c = 0
-        while i < (clust.size - self.max_num_features):
-            for _ in range(self.max_num_features):
+        while i < (clust.size - self.num_per_group):
+            for _ in range(self.num_per_group):
clust[i] = c
i += 1
c += 1
@@ -146,6 +153,49 @@ def _make_abundance_clusters(self):
c += 1
return clust, c

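For reference, a standalone re-implementation of the grouping loop above (the tail of the method is folded out of the diff, so the remainder handling is inferred from the visible `c += 1` / `return clust, c` lines): features go into consecutive blocks of `num_per_group`, and any remainder becomes a final, smaller cluster.

```python
import numpy as np

def abundance_clusters(n_features, num_per_group):
    """Standalone copy of the loop above, for illustration only."""
    clust = np.zeros(n_features, dtype=int)
    i, c = 0, 0
    while i < (clust.size - num_per_group):
        for _ in range(num_per_group):
            clust[i] = c
            i += 1
        c += 1
    clust[i:] = c          # remainder becomes the last (smaller) cluster
    return clust, c + 1

print(abundance_clusters(12, 5))
# (array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2]), 3)
```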
+    def _make_graph_clusters(self):
+        clust = np.zeros(self.max_num_features, dtype=int)
+        if self.num_per_group is None:
+            return clust, 1
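+        # -1 marks features that have not yet been assigned to a cluster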
+        clust = clust - 1
+        scaler = preprocessing.StandardScaler()
+        x = self._all.iloc[:, :]
+        scaler.fit(x)
+        graph_train_data = scaler.transform(x, copy=True)
+        try:
+            cov_init = GraphicalLasso(alpha=0.0001, mode='cd', max_iter=500, assume_centered=True).fit(graph_train_data)
+        except Exception:
+            print('GraphicalLasso failed, falling back to EmpiricalCovariance')
+            cov_init = EmpiricalCovariance(store_precision=True, assume_centered=True).fit(graph_train_data)
+        adj_mx = np.abs(cov_init.precision_)
+        d = np.array(adj_mx.sum(1))
+        d_add = np.diag(d)
+        d = d * 2
+        d_inv = np.power(d, -0.5).flatten()
+        d_inv[np.isinf(d_inv)] = 0.
+        d_mat_inv = np.diag(d_inv)
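+        # symmetric normalisation with degree self-loops: (2D)^{-1/2} (A + D) (2D)^{-1/2}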
+        self.graph_matrix = d_mat_inv.dot(adj_mx + d_add).dot(d_mat_inv)
+        graph_matrix = self.graph_matrix.copy()
+
+        i = 0
+        c = 0
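+        # greedy pass: seed a cluster at each unassigned feature, then pull in
+        # its num_per_group strongest neighbours from the normalised graph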
+        while i < self.max_num_features:
+            if clust[i] != -1:
+                i += 1
+                continue
+            clust[i] = c
+            temp = graph_matrix[i]
+            top_number = temp.argsort()
+            graph_matrix[:, i] = -1
+            assert i in top_number[-self.num_per_group:]
+            for j in range(self.num_per_group):
+                if clust[top_number[-j-1]] == -1:
+                    clust[top_number[-j-1]] = c
+                    graph_matrix[:, top_number[-j-1]] = -1
+            c += 1
+            i += 1
+        return clust, c

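The new method treats the absolute precision matrix from `GraphicalLasso` (conditional dependencies between taxa) as a weighted adjacency `A`, then normalises it as in the comment above. A toy check of the identity that motivates doubling `d` before the inverse square root (invented matrix, not repo data):

```python
import numpy as np

A = np.array([[1.0, 0.5, 0.0],
              [0.5, 2.0, 0.3],
              [0.0, 0.3, 1.5]])   # stand-in for np.abs(cov_init.precision_)
d = A.sum(axis=1)

# Rows of (A + diag(d)) sum to exactly 2*d, so normalising with
# (2d)^{-1/2} on both sides keeps the matrix well scaled.
assert np.allclose((A + np.diag(d)).sum(axis=1), 2 * d)

S = np.diag((2 * d) ** -0.5) @ (A + np.diag(d)) @ np.diag((2 * d) ** -0.5)
```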
def use_cluster(self, number, cluster_type='abund'):
"""Cluster type is which type of cluster to use:
abund: means that the x most abundant are in the first cluster,
@@ -165,19 +215,21 @@ def use_cluster(self, number, cluster_type='abund'):
elif cluster_type == 'abund':
self.clusters = self.clusters_abund == number
self._only_mark_first_max_num_features()
+        elif cluster_type == 'graph':
+            self.clusters = self.clusters_graph == number
else:
-            self.clusters = None
+            raise Exception('Unknown cluster type.')

def _only_mark_first_max_num_features(self):
"""Only use the x most abundant taxa in a given cluster."""
-        if self.max_num_features is None:
+        if self.num_per_group is None:
return
count = 0
for i in range(self.clusters.size):
if self.clusters[i]:
count += 1
-                if count > self.max_num_features:
+                if count > self.num_per_group:
self.clusters[i] = False

def use_splits(self, splits):
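With the new `'graph'` branch in `use_cluster`, every graph cluster can be selected and batched in turn. A sketch, assuming `dh` is a constructed `DataHandler`:

```python
for k in range(dh.clusters_graph_size):
    dh.use_cluster(k, cluster_type='graph')
    train_ds = dh.train_batched   # windowed batches restricted to this cluster's taxa
```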
@@ -197,6 +249,9 @@ def get_metadata(self, dataframe, attribute):

def _load_data(self, config):
data_raw, meta, func_tax, clusters_func, functions = load_data(config)
+        data_raw = data_raw[:self.max_num_features]
+        func_tax = func_tax[:self.max_num_features]
+        clusters_func = clusters_func[:self.max_num_features]

data_smooth = smooth(data_raw, factor = config['smoothing_factor'])
data_transformed, mean, std, min, max, transform_type = transform(data_smooth, transform = config['transform'])
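The truncation keeps the first `max_num_features` rows of each array; given the `N * T` comment below and the "most abundant first" convention in `use_cluster`'s docstring, that presumably keeps the most abundant taxa while holding the three arrays aligned. A toy shape check (shapes invented):

```python
import numpy as np

max_num_features = 200
data_raw = np.random.rand(500, 120)           # N taxa x T timepoints
func_tax = np.empty((500, 3), dtype=object)   # per-taxon metadata

data_raw = data_raw[:max_num_features]
func_tax = func_tax[:max_num_features]
assert data_raw.shape[0] == func_tax.shape[0] == max_num_features
```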
@@ -209,6 +264,7 @@ def _load_data(self, config):
columns=func_tax[:,0])

self.data_raw = data_raw # N * T
+        self.data_transformed = data_transformed
self.func_tax = func_tax
self.meta = meta
self.transform_mean = mean