-
Notifications
You must be signed in to change notification settings - Fork 0
/
additional_features_preprocessing.py
executable file
·105 lines (92 loc) · 3.91 KB
/
additional_features_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
This file:
- Creates additional features from the scraped CSV files;
- Cleans and transforms those features into a pandas dataframe;
- Saves the dataframe to a configured path.
"""
import glob
import joblib
import pandas as pd
from config import ADDITIONAL_FEATURES_PATH
from config import ADDITIONAL_FEATURES_DF_JOBLIB_PATH
from config import ADDITIONAL_FEATURES_DF_JOBLIB_FILE_NAME
class AdditionalFeaturesProcessing:
    """
    Additional features scraping and preparation.

    Holds a single working dataframe (``self.dataframe``) that every method
    reads, updates and returns. The ``__main__`` section of this file calls
    the methods in this order: load, clean, convert stars, rename columns,
    show distribution, save.
    """

    def __init__(self):
        # Glob pattern matching every CSV file in the configured directory.
        self.features_path = ADDITIONAL_FEATURES_PATH + '/*.csv'
        # Output location; path and file name are concatenated as-is on save.
        self.joblib_features_path = ADDITIONAL_FEATURES_DF_JOBLIB_PATH
        self.joblib_features_filename = ADDITIONAL_FEATURES_DF_JOBLIB_FILE_NAME
        # Empty seed frame so the expected columns exist before any load.
        self.dataframe = pd.DataFrame(columns=['stars', 'comment'])
        # Not used by any method of this class; kept as a public attribute.
        self.tmp_list = []

    def feature_loader(self) -> pd.DataFrame:
        """
        Load every CSV matched by ``self.features_path`` and add its rows
        to ``self.dataframe``.

        The rows are appended after whatever the dataframe already holds and
        the index is rebuilt as 0..n-1.

        Returns:
            The updated dataframe (also stored on ``self.dataframe``).
        """
        # glob returns files in arbitrary order; sorting keeps the row order
        # reproducible between runs.
        frames = [pd.read_csv(file_name)
                  for file_name in sorted(glob.glob(self.features_path))]
        # One concat instead of DataFrame.append in a loop: append was removed
        # in pandas 2.0 and recopied the whole frame for every file.
        self.dataframe = pd.concat([self.dataframe, *frames],
                                   ignore_index=True)
        return self.dataframe

    def data_cleaner(self) -> pd.DataFrame:
        """
        Strip leading and trailing whitespace from the comment column.

        Returns:
            The updated dataframe (also stored on ``self.dataframe``).
        """
        self.dataframe['comment'] = self.dataframe['comment'].str.strip()
        return self.dataframe

    def stars_column_int_converter(self) -> pd.DataFrame:
        """
        Convert the stars sentences to integers for model purposes.

        Only the first character of each value is kept and cast to int.
        Example input: "4,0 su 5 stelle"
        Output: 4

        Returns:
            The updated dataframe (also stored on ``self.dataframe``).
        """
        first_char = self.dataframe['stars'].str[0]
        self.dataframe['stars'] = first_char.astype(int)
        return self.dataframe

    def five_stars_remover(self) -> pd.DataFrame:
        """
        NOT USED ANYMORE AS THE MODEL IMPROVES BETTER IF DATA KEEPS BEING SKEWED.

        Remove the rows whose stars value equals 5 and rebuild the index.
        It was meant to increase the share of 1-4 stars reviews, because the
        dataset is unbalanced (very high on 5 stars reviews).
        Expects the stars column to be already converted to integers.

        Returns:
            The updated dataframe (also stored on ``self.dataframe``).
        """
        five_star_rows = self.dataframe.index[self.dataframe['stars'] == 5]
        self.dataframe.drop(five_star_rows, inplace=True)
        self.dataframe = self.dataframe.reset_index(drop=True)
        return self.dataframe

    def rename_dataframe_columns(self) -> pd.DataFrame:
        """
        Rename the columns to match the dataset this output is merged with:
        ``stars`` becomes ``score`` and ``comment`` becomes
        ``processed_sentence``.

        Returns:
            The renamed dataframe (also stored on ``self.dataframe``).
        """
        new_names = {'stars': 'score', 'comment': 'processed_sentence'}
        self.dataframe.rename(columns=new_names, inplace=True)
        return self.dataframe

    def show_distinct_count_of_scores(self) -> pd.Series:
        """
        Print and return the class distribution of the dataset.

        For each score, counts the number of distinct processed sentences,
        so duplicated sentences within a score are counted once.
        Requires the columns to be already renamed.

        Returns:
            A series indexed by score holding the distinct sentence count.
        """
        sentences_by_score = self.dataframe.groupby('score')['processed_sentence']
        score_distribution = sentences_by_score.nunique()
        print(score_distribution)
        return score_distribution

    def save_dataframe_to_joblib(self) -> pd.DataFrame:
        """
        Print the dataframe and dump it into a joblib file.

        The target is ``joblib_features_path`` directly followed by
        ``joblib_features_filename``; no separator is inserted between them.

        Returns:
            The saved dataframe.
        """
        print('Calling dataframe from word processing', self.dataframe)
        target = self.joblib_features_path + self.joblib_features_filename
        joblib.dump(self.dataframe, target)
        return self.dataframe
if __name__ == '__main__':
    # Run the whole preprocessing pipeline when executed as a script.
    PROCESSING = AdditionalFeaturesProcessing()
    # Steps run strictly in this order. five_stars_remover is intentionally
    # left out: it is marked as not used anymore.
    for step in (
            PROCESSING.feature_loader,
            PROCESSING.data_cleaner,
            PROCESSING.stars_column_int_converter,
            PROCESSING.rename_dataframe_columns,
            PROCESSING.show_distinct_count_of_scores,
            PROCESSING.save_dataframe_to_joblib,
    ):
        step()