-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproc_waseem.py
52 lines (37 loc) · 1.67 KB
/
proc_waseem.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import os
# Input: directory and file name of the processed source dataset.
file_path_in = './../processed_data/Waseem'
file_name_in = 'Waseem.csv'

# Input: directory and file name of the translated dataset
# (the "en2ko" suffix suggests English-to-Korean; confirm with the data owner).
tr_file_path = './../translated_data/Waseem'
tr_file_name = 'Waseem_en2ko.csv'

# Output: target directory and base name for the CSV files this script writes.
save_path = './Waseem'
save_name = 'Waseem'
def file_checker(path):
    """Ensure that ``path`` refers to an existing regular file.

    Args:
        path: Filesystem path of the file that is about to be read.

    Returns:
        None. The function only validates and returns nothing on success.

    Raises:
        FileNotFoundError: If nothing exists at ``path``, or if ``path``
            exists but is not a file (for example, a directory).
    """
    # os.path.exists() is also true for directories, which would let a wrong
    # path pass this check and only fail later when it is opened as a file.
    if not os.path.isfile(path):
        raise FileNotFoundError("File doesn't exist: {}".format(path))
def set_target(df):
    """Map a record's ``Target`` value to the string label '0' or '1'.

    Args:
        df: Any object that can be indexed with the key 'Target'.

    Returns:
        '0' when the ``Target`` value compares equal to 0, otherwise '1'.
    """
    is_zero = df['Target'] == 0
    return '0' if is_zero else '1'
if __name__ == '__main__':
    # Resolve both input files and fail early if either one is missing.
    origin_path = os.path.join(file_path_in, file_name_in)
    file_checker(origin_path)
    translation_path = os.path.join(tr_file_path, tr_file_name)
    file_checker(translation_path)

    origin_df = pd.read_csv(origin_path)  # [Dataset, Id, Context=Nan, Comment, Target 0 1, Annotation]
    trans_df = pd.read_csv(translation_path)  # [Id, Context, Comment, Target]
    print(origin_df.columns)
    print(trans_df.columns)

    # Take Dataset/Annotation from the original file and the text and label
    # columns from the translated file. The inner join on 'Id' keeps only
    # rows that are present in both files.
    merged_df = pd.merge(origin_df[['Dataset', 'Id', 'Annotation']],
                         trans_df[['Id', 'Context', 'Comment', 'Target']],
                         on='Id', how='inner')

    # Normalise the label to the strings '0' / '1' and blank out the context.
    merged_df['Target'] = merged_df.apply(set_target, axis=1)
    merged_df['Context'] = ""

    merged_df = merged_df[['Dataset', 'Id', 'Context', 'Comment', 'Target', 'Annotation']]

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output directory exists before writing either file.
    os.makedirs(save_path, exist_ok=True)

    # Full output, including the Annotation column.
    save_file_name = os.path.join(save_path, 'origin_{}.csv'.format(save_name))
    merged_df.to_csv(save_file_name, index=False)

    # Reduced output without the Annotation column.
    new_df = merged_df[['Dataset', 'Id', 'Context', 'Comment', 'Target']]
    save_file_name = os.path.join(save_path, 'model_{}.csv'.format(save_name))
    new_df.to_csv(save_file_name, index=False)