-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprepare_data.py
83 lines (62 loc) · 1.89 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import glob
import os
import pandas as pd
import remove_redundancy
import argparse
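'''
This script removes redundancy from the ground-truth event chains and then
splits the resulting chains into one CSV file per day. The sorted list of
days is written to days.txt.
'''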
# Command-line interface: three directory options, all plain strings.
# Each entry is (flag, default directory, help text).
_DIRECTORY_OPTIONS = (
    (
        '--groundtruthchains',
        'ground_truth_chains/',
        'Input directory for input event chains',
    ),
    (
        '--perdaydata',
        'per_day_data/',
        'Output directory for per day data',
    ),
    (
        '--redundancyremoveddata',
        'redundancy_removed_chains/',
        'Output directory for redundancy removed chains',
    ),
)

parser = argparse.ArgumentParser()
for _flag, _default_dir, _help_text in _DIRECTORY_OPTIONS:
    parser.add_argument(_flag, default=_default_dir, type=str, help=_help_text)
def main(args):
    """Remove redundancy from the event chains, then split them into per-day CSVs.

    Steps:
      1. ``remove_redundancy.run`` is called with ``args.groundtruthchains``
         and ``args.redundancyremoveddata``; the latter directory is then
         used as the input of the next step.
      2. Every ``*.csv`` file in that directory is read (no header row,
         latin-1 encoding) and its rows are grouped by the first eight
         characters of column 0 (presumably a YYYYMMDD date prefix --
         confirm against the chain file format).
      3. One ``<day>.csv`` per group, sorted by column 0 and written without
         index or header, is stored in ``args.perdaydata``. The sorted day
         keys are written one per line to ``days.txt`` in the current
         working directory.

    Args:
        args: Parsed command-line namespace providing the string attributes
            ``groundtruthchains``, ``redundancyremoveddata`` and
            ``perdaydata``.
    """
    print("Removing redundancy")
    remove_redundancy.run(args.groundtruthchains, args.redundancyremoveddata)

    print("Preparing per day files")
    input_dir = args.redundancyremoveddata
    output_dir = args.perdaydata

    per_day_data = {}
    # glob returns files in filesystem order; sort so runs are reproducible.
    for csv_path in sorted(glob.glob(os.path.join(input_dir, '*.csv'))):
        df = pd.read_csv(csv_path, header=None, encoding='latin-1')
        for row in df.values.tolist():
            try:
                day = row[0][0:8]
            except (TypeError, IndexError):
                # Column 0 is not sliceable (e.g. NaN or a number): skip the
                # row, as before, but without hiding unrelated errors.
                continue
            per_day_data.setdefault(day, []).append(row)

    # Make sure the target directory exists; an empty string means the
    # current working directory, which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for day, rows in per_day_data.items():
        day_df = pd.DataFrame(rows).sort_values(by=[0])
        # os.path.join works whether or not output_dir ends with a separator.
        day_df.to_csv(
            os.path.join(output_dir, day + '.csv'),
            sep=',',
            index=False,
            header=False,
        )

    with open('days.txt', 'w') as f:
        f.writelines("%s\n" % day for day in sorted(per_day_data))
if __name__ == "__main__":
    # Script entry point: parse the command line and run the pipeline.
    main(parser.parse_args())