-
Notifications
You must be signed in to change notification settings - Fork 37
/
make_dataset.py
82 lines (67 loc) · 3.03 KB
/
make_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pandas as pd
import random
import math
def subtype_selection(subtype):
    """Map a virus subtype name to its integer flag and store it globally.

    Parameters
    ----------
    subtype : str
        One of 'H1N1', 'H3N2', 'H5N1' or 'COV19'.

    Returns
    -------
    int
        0 for 'H1N1', 1 for 'H3N2', 2 for 'H5N1', 3 for 'COV19'.
        The value is also written to the module-level global
        ``subtype_flag`` for callers that read it directly.

    Raises
    ------
    ValueError
        If *subtype* is not one of the recognized names.  (The original
        if/elif chain silently fell through on unknown input, returning a
        stale global or raising NameError on the first call.)
    """
    global subtype_flag, data_path
    # Dict dispatch replaces the if/elif chain; per-subtype data paths were
    # commented out in the original and are intentionally not restored here.
    flags = {'H1N1': 0, 'H3N2': 1, 'H5N1': 2, 'COV19': 3}
    if subtype not in flags:
        raise ValueError(f"Unknown subtype: {subtype!r}")
    subtype_flag = flags[subtype]
    return subtype_flag
def read_trigram_vecs(subtype):
    """
    Load the 100-dimensional ProtVec trigram embeddings from disk.

    Returns a dict mapping each amino-acid trigram to its row index and a
    numpy array holding the corresponding trigram vectors.

    NOTE(review): the ``subtype`` argument is currently unused — the data
    path is hard-coded to the H1N1 cluster directory; confirm whether it
    should route through ``subtype_selection`` instead.
    """
    data_path = '/home/zh/codes/rnn_virus_source_code/data/raw/H1N1_cluster/'
    prot_vec_file = 'protVec_100d_3grams.csv'
    frame = pd.read_csv(data_path + prot_vec_file, delimiter='\t')
    # 'words' column holds the trigrams; every other column is a vector dim.
    trigram_list = list(frame['words'])
    index_by_trigram = {t: idx for idx, t in enumerate(trigram_list)}
    vec_matrix = frame.loc[:, frame.columns != 'words'].values
    return index_by_trigram, vec_matrix
def read_strains_from(data_files, data_path):
    """
    Load raw strain sequences from each CSV file under *data_path*.

    Each file must contain a 'seq' column.  Returns a list of pandas
    Series, one per file, in the same order as *data_files*.
    """
    # One Series per input file; pd.read_csv raises if a file is missing.
    return [pd.read_csv(data_path + file_name)['seq'] for file_name in data_files]
def train_test_split_strains(strains_by_year, test_split, cluster):
    """
    Split each year's strains into two disjoint train/test sets.

    Parameters
    ----------
    strains_by_year : list of pandas.Series
        One Series of strain sequences per year.
    test_split : float
        Fraction of each year's strains reserved for testing.  Only used
        when ``cluster == 'random'``; the fixed branch ignores it.
    cluster : str
        'random' shuffles each year and splits by *test_split*; any other
        value takes a fixed time-series slice (rows 0-799 train,
        800-999 test).

    Returns
    -------
    (train_strains, test_strains) : tuple of lists of pandas.Series
        Index-reset Series, one pair per input year.
    """
    train_strains, test_strains = [], []
    if cluster == 'random':
        for strains in strains_by_year:
            num_train = int(math.floor(strains.count() * (1 - test_split)))
            # sample(frac=1) shuffles the whole Series (no fixed seed).
            shuffled = strains.sample(frac=1).reset_index(drop=True)
            train_strains.append(shuffled.iloc[:num_train].reset_index(drop=True))
            test_strains.append(shuffled.iloc[num_train:].reset_index(drop=True))
    else:
        # Fixed time-series split: first 800 rows train, next 200 test.
        # Change these indices to vary the starting point across experiments.
        # (The original also computed num_of_training_examples here but
        # never used it — dead code, removed.)
        for strains in strains_by_year:
            train_strains.append(strains.iloc[:800].reset_index(drop=True))
            test_strains.append(strains.iloc[800:1000].reset_index(drop=True))
    return train_strains, test_strains