-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoder.py
201 lines (170 loc) · 7.84 KB
/
coder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""Train a RandomForest on labeled box/transcript TSVs and predict the
'field' column for each unlabeled validation TSV, writing results back out.
"""
import pandas as pd
import numpy as np
import os
import glob
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
from sklearn.preprocessing import LabelEncoder
# Encoders for the target labels ('field') and the free-text 'transcript'
# feature; both are fitted later on the combined training DataFrame.
label_encoder_y_train = LabelEncoder()
label_encoder_x_train = LabelEncoder()
#y_train_encoded = label_encoder.fit_transform(y_train_labels)
datab = [] # Accumulates one cleaned DataFrame per training TSV file
# datab_val = []
# datab_val_w_ann = []
def concatenate_elements_if_gt_one(input_list): # No longer useful in this case
    """Collapse a list of string tokens into a single-element list.

    - More than one element: space-join them into one string.
    - Exactly one element: return the list unchanged.
    - Empty list: return ['2019'] — a placeholder value from the original
      code (presumably a default year; TODO confirm why '2019').

    Args:
        input_list: list of strings (tokens from one transcript cell).
    Returns:
        A list containing exactly one string (or the original one-element
        list object, unchanged).
    """
    if len(input_list) > 1:
        return [" ".join(input_list)]
    if input_list:
        return input_list
    # Empty input: fall back to the historical placeholder.
    return ['2019']
# Canonical column names applied to every training TSV (8 columns:
# token span indices, bounding-box coordinates, transcript text, label).
new_column_names = [
    "start_index", "end_index", "x_top_left", "y_top_left",
    "x_bottom_right", "y_bottom_right", "transcript", "field"
]
# Same schema for validation files, which lack the 'field' label column.
new_column_names_val = [
    "start_index", "end_index", "x_top_left", "y_top_left",
    "x_bottom_right", "y_bottom_right", "transcript"
]
# Load every per-document training TSV, normalize its columns, and merge
# them all into one training DataFrame (combined_dff).
folder_path = 'dataset/dataset/train/boxes_transcripts_labels'
csv_files = glob.glob(folder_path + '/*.tsv')
for csv_file in csv_files:
    dff = pd.read_csv(csv_file, sep='\t')
    # Skip malformed exports whose columns got merged: a clean labeled
    # file has 9 columns (the saved pandas index plus the 8 data fields).
    if dff.shape[1] < 9:
        continue
    # Drop the index column pandas wrote when the TSV was created.
    dff = dff.drop(columns=['Unnamed: 0'])
    dff.columns = new_column_names
    datab.append(dff)
# Merge all per-file frames into the single training DataFrame.
combined_dff = pd.concat(datab, ignore_index=True)
# Validation-set locations. Files under val_original appear to be
# unlabeled (7 data columns, per new_column_names_val); files under
# val_w_ann carry the 'field' label as well.
folder_path_val = 'dataset/dataset/val_original/boxes_transcripts'
folder_path_val_w_nn = 'dataset/dataset/val_w_ann/boxes_transcripts_labels'
# NOTE(review): two large blocks of dead, string-literal-quoted loading
# code for these folders were removed here — they were runtime no-ops and
# referenced undefined names (datab_val, datab_val_w_ann). The live
# val_original loading/prediction loop appears later in this file.
# Train-time label/feature preparation. A RandomForest classifier is used
# for multi-class prediction over the 'field' labels (the original comment
# claims 15 classes — confirm against the classes_ printed below).
from sklearn.ensemble import RandomForestClassifier
y_train_labels = combined_dff['field'] # Target column the model must predict
print((y_train_labels))
y_train_encoded = label_encoder_y_train.fit_transform(y_train_labels) # Encode string labels to ints for the classifier
print(label_encoder_y_train.classes_)
X_train_trans = combined_dff['transcript'] # Only this column needs encoding: it mixes strings, numbers, and special characters
X_train_rem = combined_dff.drop(columns = ['field', 'transcript']).to_numpy() # Remaining feature columns are already numeric
#print(X_train_rem)
#print(X_train_trans)
X_train_trans_encoded = label_encoder_x_train.fit_transform(X_train_trans).reshape(-1, 1)# Reshape to a column so it can be stacked with the numeric features
#X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_train_combined = np.hstack((X_train_rem, X_train_trans_encoded))# Final training feature matrix
# Folder where per-file prediction TSVs are written.
output_folder = 'dataset/dataset/output/'
# exist_ok=True avoids the check-then-create race of the original
# os.path.exists + os.makedirs pair.
os.makedirs(output_folder, exist_ok=True)
# Fit the classifier ONCE on the full training set. (The original code
# re-created and re-fitted an identical RandomForest inside the per-file
# loop below, redoing the same training work for every validation file.)
model = RandomForestClassifier(n_estimators=20)
model.fit(X_train_combined, y_train_encoded)
# Predict the 'field' column for each validation TSV and save the result.
csv_files = glob.glob(folder_path_val + '/*.tsv')
for csv_file in csv_files:
    dff_val = pd.read_csv(csv_file, sep='\t')
    # Skip malformed exports whose columns got merged: a clean unlabeled
    # file has 8 columns (the saved pandas index plus 7 data fields).
    if dff_val.shape[1] < 8:
        print(csv_file)
        continue
    # Drop the pandas index column and normalize the column names.
    dff_val = dff_val.drop(columns=['Unnamed: 0'])
    dff_val.columns = new_column_names_val
    # Build test features the same way as training: label-encode the
    # free-text transcript, keep the remaining numeric columns as-is.
    X_test_trans = dff_val['transcript']
    X_test_rem = dff_val.drop(columns=['transcript']).to_numpy()
    # NOTE(review): fitting a fresh LabelEncoder per file assigns
    # transcript codes that are inconsistent with the training encoding,
    # so the transcript feature is effectively arbitrary at predict time.
    # Kept as-is to preserve behavior; consider transforming with
    # label_encoder_x_train (with unseen-value handling) instead.
    label_encoder_x_test = LabelEncoder()
    X_test_trans_encoded = label_encoder_x_test.fit_transform(X_test_trans).reshape(-1, 1)
    X_test = np.hstack((X_test_rem, X_test_trans_encoded))
    # Predict encoded labels, then map them back to the original strings.
    y_test_pred = model.predict(X_test)
    y_test_pred_decoded = label_encoder_y_train.inverse_transform(y_test_pred)
    dff_val['field'] = y_test_pred_decoded
    # Write predictions to the output folder AND back over the source
    # file — the original did both; preserved for compatibility.
    output_file = os.path.join(output_folder, os.path.basename(csv_file))
    dff_val.to_csv(output_file, sep='\t', index=False)
    dff_val.to_csv(csv_file, sep='\t', index=False)