-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
163 lines (125 loc) · 4.64 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import numpy as np
import pandas as pd
from constants import *
import tensorflow as tf
import pickle
import json
import argparse
from sklearn.model_selection import train_test_split
import sys
def read_data(path, show_labels=False):
    """
    Load a CSV file and keep only its labeled rows.

    Rows with a null VALUE entry are dropped, the frame is narrowed to the
    VALUE and LABEL columns, and rows whose label is the two-character
    marker made of a backslash followed by ``N`` are discarded.

    :param str path: Path to the CSV document
    :param bool show_labels: Print the distinct labels left in the data
    :return: DataFrame with the VALUE and LABEL columns of the labeled rows
    """
    # Drop rows with a missing value before narrowing the columns.
    frame = pd.read_csv(path).dropna(subset=[VALUE])
    frame = frame.loc[:, [VALUE, LABEL]]

    # Rows carrying the backslash-N marker are treated as unlabeled.
    has_label = frame[LABEL] != '\\N'
    frame = frame.loc[has_label]

    if show_labels:
        print(frame[LABEL].unique())
    return frame
def normalize_data_distribution(data):
    """
    Balance the data so every label contributes the same number of rows.

    The per-label quota is the smallest count of distinct VALUE entries
    found among the labels; the first rows of each label up to that quota
    are kept.

    :param pandas.core.frame.DataFrame data: Input data to be balanced
    :return: DataFrame with at most the quota of rows for each label
    """
    by_label = data.groupby(LABEL)

    # Distinct VALUE entries per label; the rarest label sets the quota.
    unique_per_label = by_label.nunique()[VALUE]
    quota = min(unique_per_label)

    return by_label.head(n=quota)
def encode_labels(labels):
    """
    One-hot encode textual labels with Keras.

    The class indices are ``no_idea`` -> 0, ``not_recommended`` -> 1 and
    ``recommended`` -> 2.

    :param pandas.core.series.Series labels: Original labels of input data
    :return: float32 array with one row per input label and three columns
    :raises ValueError: If a label is not one of the three known classes
    """
    class_index = {'no_idea': 0, 'not_recommended': 1, 'recommended': 2}
    raw_labels = np.asarray(labels).tolist()

    # An unknown label must not be skipped silently: dropping it would make
    # the encoded output shorter than the input and shift every following
    # label onto the wrong sample.
    unknown = sorted({repr(label) for label in raw_labels if label not in class_index})
    if unknown:
        raise ValueError(
            "Unknown label(s): {0}. Expected one of: {1}".format(
                ", ".join(unknown), ", ".join(class_index)
            )
        )

    # Explicit integer dtype so that an empty input still yields integer indices.
    y = np.array([class_index[label] for label in raw_labels], dtype="int64")
    return tf.keras.utils.to_categorical(y, len(class_index), dtype="float32")
def save_model(fn, obj):
    """
    Pickle an object to disk so it can be reloaded later.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.

    :param str fn: Name of the file to write
    :param obj: Object to be serialized
    :return: None
    """
    with open(fn, 'wb') as sink:
        # Use the most recent pickle protocol this interpreter supports.
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
def read_config(fn, perm="r"):
    """
    Parse a JSON config file holding the parameters used by the program.

    :param str fn: Name of the config file (config.json)
    :param str perm: Mode in which the file is opened
    :return: Parsed content of the config file
    """
    with open(fn, perm) as source:
        raw = source.read()
    # Decode after the file is closed; the parsed result is the same.
    return json.loads(raw)
def load_tokens(fn, perm="rb"):
    """
    Load a pickled object (the saved tokenizer) from disk.

    :param str fn: Name of the pickle file
    :param str perm: Mode in which the file is opened
    :return: The unpickled object, expected to be the Keras tokenizer
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fn, perm) as source:
        return pickle.Unpickler(source).load()
def split_data(data, labels):
    """
    Partition the dataset into train and test subsets.

    The split uses a fixed random_state of 0, so repeated calls on the same
    input give the same partition.

    :param data: Input data
    :param labels: Corresponding labels of data
    :return: Tuple (X_train, X_test, y_train, y_test)
    """
    # Two inputs yield four parts, ordered as in the return description.
    parts = train_test_split(data, labels, random_state=0)
    return tuple(parts)
def init_app(args):
    """
    Entry point of the program: resolve the run settings and load the data.

    When standard output is attached to a terminal, the mode and model are
    taken from the command line (lower-cased). Otherwise they are read from
    ``args["mode"]`` and ``args["model"]`` as given.

    :param args: Mapping with "mode" and "model" keys, used when the program
        is not run from a terminal
    :return: Tuple (data, mode, model). Train data is additionally balanced
        across labels.
    :raises SystemExit: If the mode is neither "train" nor "test"
    """
    print("Initializing...")
    is_cli_mode = sys.stdout.isatty()

    if is_cli_mode:
        # Parse sys.argv only on this branch. In other environments argv may
        # hold arguments meant for a host process, and argparse would exit
        # the program on them even though the values come from `args`.
        cli_args = cli_mode()
        mode = cli_args.mode.lower()
        model = cli_args.model.lower()
    else:
        mode = args["mode"]
        model = args["model"]

    if mode == "train":
        data = normalize_data_distribution(read_data(TRAIN_DATA))
    elif mode == "test":
        data = read_data(TEST_DATA)
    else:
        sys.exit("Selected mode is wrong. (train or test)")
    return data, mode, model
# print("Loading {0} mode...".format(mode))
def cli_mode():
    """
    Build the command-line parser and parse the program arguments.

    Both positional arguments are optional: ``mode`` falls back to "test"
    and ``model`` falls back to "lstm" when they are not supplied.

    :return: Parsed arguments from CLI
    """
    # (name, default, help text) for each optional positional argument.
    positional_specs = (
        ('mode', "test", 'Choose whether you want to train a model or test one'),
        ('model', "lstm", 'Choose the model you wish to train/test'),
    )

    cli_parser = argparse.ArgumentParser(
        description="Manual to use this script:",
        usage="python main.py mode model",
    )
    for arg_name, fallback, help_text in positional_specs:
        cli_parser.add_argument(arg_name, type=str, nargs='?', default=fallback, help=help_text)

    return cli_parser.parse_args()