# complete_feature_extraction_script.py
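"""
Orchestrate the feature-extraction pipeline: run each script in
Scripts/Feature_extraction/ in sequence, sharing the configured paths with
the subscripts through a generated paths.json file and environment variables.
"""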
import os, sys, inspect
import time
import RootPath
import json


def run_script(name, generate_dict, is_test):
    """Run a single feature-extraction script in a subshell and return its exit status."""
    print(f"\nRunning script {name}")
    start_time = time.time()
    command = "python3 Scripts/Feature_extraction/" + name
    if not generate_dict:
        command += " --no_dict_generation"
    if is_test:
        command += " --is_test"
    # os.system returns the shell's wait status; 0 means success
    exit_code = os.system(command)
    end_time = time.time()
    print(f"Script {name} ended!\nTime needed: {end_time - start_time}s\n")
    return exit_code
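# For example, run_script('fe01_follower_features.py', generate_dict=False, is_test=True)
# executes the shell command:
#   python3 Scripts/Feature_extraction/fe01_follower_features.py --no_dict_generation --is_test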
def do_feature_extraction(config, all_scripts, generate_dict, is_test) -> None:
    """
    Main function responsible for the feature extraction of the data
    """
    # add root directory for looking for modules
    current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    # guarantee the existence of all needed directories
    for k, directory in config.items():
        if 'path' in k:
            os.makedirs(directory, exist_ok=True)
    # dump the config as JSON to make it available to all the subscripts
    with open(os.path.join(current_dir, "Scripts", "Feature_extraction", "paths.json"), "w+") as write_file:
        json.dump(config, write_file, indent=4, separators=(',', ': '))
    # export environment variables used to coordinate the scripts
    os.environ['PYTHONPATH'] = current_dir  # project's root dir from where all imports should start
    os.environ['DASK_TEMPORARY_DIRECTORY'] = config['dask_tmp_path']  # where to locate Dask's swap
    start = time.time()
    for s in all_scripts:
        exit_code = run_script(s, generate_dict, is_test)
        if exit_code != 0:
            print(f"Exit code is {exit_code}, aborting the remaining scripts")
            break
    end = time.time()
    print(f"\n\nTime elapsed for whole script: {end - start}s")
if __name__ == '__main__':
    print('Python %s on %s' % (sys.version, sys.platform))
    if RootPath.is_aws():
        print("Detected running on AWS!")
    else:
        print("Running locally")
    print(f"Dataset folder used: {RootPath.get_dataset_path()}")
    # define the base path where the data can be found;
    # all results will be nested inside this folder
    preproc_dict_path = os.path.join(RootPath.get_dataset_path(), 'Preprocessed', 'Dictionary')
    # generate_dict=True extracts from Train and builds the dictionaries;
    # generate_dict=False extracts from Valid, reusing Train's dictionaries
    generate_dict = True
    is_test = False
    if generate_dict:
        data_path = os.path.join(RootPath.get_dataset_path(), 'Preprocessed', 'Train')
        base_path = os.path.join(data_path, 'FeatureExtraction')
        dict_path = os.path.join(base_path, 'Dictionary')
    else:
        data_path = os.path.join(RootPath.get_dataset_path(), 'Preprocessed', 'Valid')
        base_path = os.path.join(data_path, 'FeatureExtraction')
        dict_path = os.path.join(RootPath.get_dataset_path(), 'Preprocessed', 'Train', 'FeatureExtraction', 'Dictionary')
    # scripts run in this order; each one reads paths.json for its inputs
    all_scripts = [
        'fe01_follower_features.py',
        'fe02_user_hashtags.py',
        'fe03_categorical_combo.py',
        'fe20_merge_all_features.py',
        'fe_32a_target_encoding_split_cols.py',
        # 'fe_33_target_encoding_mapping'
    ]
    # define all the config paths needed by the subscripts
    config = {
        'data_path': data_path,
        'base_path': base_path,
        'temp_path': os.path.join(base_path, 'Temp'),
        'preproc_dict_path': preproc_dict_path,
        'dict_path': dict_path,
        'dask_tmp_path': os.path.join(base_path, 'Temp', 'dask_tmp'),
    }
    do_feature_extraction(config, all_scripts, generate_dict, is_test)
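# Typical invocation from the project root (assuming RootPath resolves the
# dataset folder from there):
#   python3 complete_feature_extraction_script.py
# Flip generate_dict to False to process the Valid split with Train's
# dictionaries, or set is_test to True to pass --is_test to every subscript.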