dataprocess.py
import os
import json

import pandas as pd


class Preprocessing:
    def __init__(self, outputdir="./result/"):
        """
        Load config.json and prepare one output CSV per configured website.
        """
        os.makedirs(outputdir, exist_ok=True)
        self.outputdir = outputdir
        self.dataframes_dict = {}
        self.keywords_dict = {}
        self.filenames_dict = {}
        # load the JSON config describing the websites and the input data format
        with open("config.json", "r", encoding="utf-8") as load_conf:
            config_json = json.load(load_conf)
        self.websites_conf = config_json["websites"]
        self.data_compress_type = config_json["data_compress_type"]
        self.data_columns_name = config_json["data_columns_name"]
        self.data_delimiter = config_json["data_delimiter"]
        for website, website_info in self.websites_conf.items():
            self.keywords_dict[website] = website_info["keywords"]
            self.dataframes_dict[website] = pd.DataFrame(columns=["time", "domain_name"])
            file_name = os.path.join(self.outputdir, website_info["res_filename"])
            self.filenames_dict[website] = file_name
            # create the result file and write the CSV header
            with open(file_name, "w") as csv_fd:
                csv_fd.write("time,domain_name\n")

    def save_data(self):
        # append each website's accumulated rows to its result file
        for website, dataframe in self.dataframes_dict.items():
            dataframe.to_csv(self.filenames_dict[website], mode='a', index=False, header=False)
            # empty the dataframe so the next batch starts fresh
            dataframe.drop(dataframe.index, inplace=True)
        print("data saved.")

    def process_dataframe(self, df_data):
        # route each row to the first website whose keyword pattern matches
        for website, keywords in self.keywords_dict.items():
            # "keywords" is a regex pattern matched against the domain_name column
            osn_dataframe = df_data[df_data["domain_name"].str.contains(keywords, na=False)]
            # drop matched rows so they are not attributed to a later website
            df_data = df_data.drop(index=osn_dataframe.index)
            # append the matched rows to this website's in-memory dataframe
            self.dataframes_dict[website] = pd.concat(
                [self.dataframes_dict[website], osn_dataframe], ignore_index=True
            )

    def process_data(self, data_dir_list):
        # each entry in data_dir_list is a directory holding one day of data
        for filedir in data_dir_list:
            file_list = os.listdir(filedir)
            print("+++ processing datadir:", filedir)
            for file in file_list:
                # read one compressed file, keeping only the two needed columns
                df = pd.read_csv(
                    os.path.join(filedir, file),
                    compression=self.data_compress_type,
                    delimiter=self.data_delimiter,
                    names=self.data_columns_name,
                )[["time", "domain_name"]]
                self.process_dataframe(df)
            # flush the accumulated rows to the result files
            self.save_data()


if __name__ == "__main__":
    data_process = Preprocessing()
    data_process.process_data(["./data/20220117/"])
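
For reference, here is a minimal sketch of the config.json that Preprocessing.__init__ reads. The top-level keys (websites, data_compress_type, data_columns_name, data_delimiter) and the per-website keys (keywords, res_filename) come straight from the code above; every concrete value below (website names, regex patterns, column names, filenames) is an illustrative assumption, not taken from the repo:

import json

# Hypothetical example config matching the schema read by Preprocessing.__init__.
# All concrete values here are assumptions; adjust them to the real data.
example_config = {
    "data_compress_type": "gzip",   # forwarded to pandas.read_csv(compression=...)
    "data_delimiter": ",",          # forwarded to pandas.read_csv(delimiter=...)
    "data_columns_name": ["time", "domain_name"],  # must include the two kept columns
    "websites": {
        "facebook": {
            "keywords": "facebook|fbcdn",    # regex matched against domain_name
            "res_filename": "facebook.csv"   # result file created under outputdir
        }
    }
}

with open("config.json", "w", encoding="utf-8") as fd:
    json.dump(example_config, fd, indent=4)

Note that str.contains treats keywords as a regular expression, and process_dataframe removes matched rows before testing the next website, so the iteration order of websites decides which one claims a domain that matches more than one pattern.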