-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataCleaning1.py
377 lines (304 loc) · 14.3 KB
/
DataCleaning1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
import seaborn as sns
import pandas as pd
import missingno as msno
import numpy as np
class DataCleaner:
def __init__(self, source, df, file_name, problem_type, type_column, dependent_var):
self.source = source
self.file_name = file_name
self.data = df
self.problem_type = problem_type
self.type_column = type_column
self.dependent_var = dependent_var
self.problem_type_identify()
def problem_type_identify(self):
try:
print("I am in problem type identifier function")
# Call the appropriate EDA method based on problem type
if self.problem_type.lower() == "time series":
self.time_series()
elif self.problem_type.lower() == "categorical":
self.categorical()
elif self.problem_type.lower() == "numerical":
self.numerical()
else:
raise ValueError("Invalid problem type. Supported types are 'time series', 'categorical', and 'numerical'.")
except Exception as e:
print(f"Error occurred in problem type identification: {e}")
def numerical(self):
# print("cleaning")
# self.handle_missing_values(method='drop')
# self.handle_duplicates()
# self.remove_high_cardinality_and_constant_columns()
# self.handle_outliers(threshold_percentage=2)
# self.reset_index()
# self.msno_plot()
print("cleaning")
self.remove_high_cardinality_and_constant_columns()
self.handle_outliers(threshold_percentage=1)
self.data = self.identify_and_remove_outliers(k=1.5)
self.drop_columns()
# handling categorical missing values
self.handle_cat_missing_values(method = 'fillna_others')
# handling numerical missing values
self.handle_missing_values(method= 'fillna_mean')
self.handle_duplicates()
self.reset_index()
self.msno_plot()
self.reset_index()
#removing the datetime columns
# self.remove_datetime_columns()
# self.data.info()
def categorical(self):
print("cleaning")
self.remove_high_cardinality_and_constant_columns()
self.handle_outliers(threshold_percentage=1)
# self.data = self.identify_and_remove_outliers(k=1.5)
self.drop_columns()
# handling categorical missing values
self.handle_cat_missing_values(method = 'fillna_others')
# handling numerical missing values
self.handle_missing_values(method= 'fillna_mean')
self.handle_duplicates()
self.reset_index()
self.msno_plot()
self.reset_index()
self.remove_datetime_columns()
def get_data(self):
return self.data
def time_series(self):
# self.plot_resampled_time_series()
# self.data = self.identify_and_remove_outliers(k=1.5)
self.drop_columns()
# print("____ Missing Numbers after Outliers Treatment_____")
# self.msno_plot()
if self.data.isnull().any().any():
self.data.fillna(method='ffill', inplace = True)
self.data.fillna(method='bfill', inplace=True)
self.data.fillna(self.data.mean(), inplace=True)
self.handle_duplicates()
# self.handle_datetime(format='%Y-%m-%d %H:%M:%S')
# self.reset_index()
print("____ Dataset after missing values Treatment_____")
self.msno_plot()
self.remove_high_cardinality_and_constant_columns()
def remove_high_cardinality_and_constant_columns(self, threshold=90):
"""
Remove high-cardinality categorical columns and constant columns.
Parameters:
- df: pandas DataFrame
The input DataFrame.
- threshold: int, optional (default=90)
The threshold for the number of unique values. Columns with more than
this number of unique values will be removed.
Returns:
- df_filtered: pandas DataFrame
The DataFrame with high-cardinality categorical columns and constant columns removed.
"""
# Identify categorical columns
categorical_columns = self.data.select_dtypes(include=['object', 'category']).columns
# Filter columns with high cardinality
high_cardinality_columns = [col for col in categorical_columns if self.data[col].nunique() > threshold]
# Identify constant columns
constant_columns = [col for col in self.data.columns if self.data[col].nunique() == 1]
# Combine columns to remove
columns_to_remove = set(high_cardinality_columns + constant_columns)
# Drop columns
df_filtered = self.data.drop(columns=columns_to_remove)
self.data = df_filtered
def remove_datetime_columns(self):
"""
Removes datetime columns from a DataFrame.
Parameters:
- df (DataFrame): Input DataFrame containing datetime columns.
Returns:
- df (DataFrame): DataFrame with datetime columns removed.
"""
# Identify datetime columns
datetime_columns = []
for column in self.data.columns:
try:
if self.data[column].dtype == 'datetime64[ns]':
pd.to_datetime(self.data[column])
datetime_columns.append(column)
except ValueError:
pass
# Remove datetime columns from DataFrame
self.data.drop(columns=datetime_columns,inplace=True,errors='ignore')
return self.data
def drop_columns(self, threshold=0.5):
"""
Drop columns with missing values exceeding a specified threshold.
Parameters:
- threshold (float): Threshold for the percentage of missing values.
"""
cols_to_drop = self.data.columns[self.data.isnull().mean() > threshold]
self.data.drop(columns=cols_to_drop, inplace=True)
id_patterns = ['id', 'ID', 'Id', '_id', 'Identifier', 'identifier']
# Find column names that contain any of the patterns.
id_columns = [col for col in self.data.columns if any(pattern in col for pattern in id_patterns)]
# Drop the identified columns from the DataFrame.
self.data.drop(columns=id_columns,inplace=True, errors='ignore') # errors='ignore' allows dropping non-existing columns without error
def identify_and_remove_outliers(self, k=1.5):
"""
Identify and remove outliers from a time series using the Interquartile Range (IQR) method.
Parameters:
- time_series_data (pd.Series): The time series data to process.
- k (float): The multiplier for determining the outlier threshold. Default is 1.5.
Returns:
- pd.Series: The time series data with outliers removed.
"""
time_series_data = self.data
# Calculate quartiles and IQR
Q1 = time_series_data.quantile(0.25)
Q3 = time_series_data.quantile(0.75)
IQR = Q3 - Q1
# Define the outlier threshold range
lower_bound = Q1 - k * IQR
upper_bound = Q3 + k * IQR
# Identify outliers
outliers = (time_series_data < lower_bound) | (time_series_data > upper_bound)
# Replace outliers with NaN or choose another method (e.g., imputation)
time_series_data[outliers] = np.nan
# Alternatively, you can remove the outliers by filtering
# filtered_data = time_series_data[~outliers]
self.data = time_series_data
return time_series_data
def handle_outliers(self, threshold_percentage = 1):
# Separate numerical and categorical columns
numerical_columns = self.data.select_dtypes(include=np.number).columns
categorical_columns = self.data.select_dtypes(include='object').columns
# Identify and handle outliers for numerical columns
for col in numerical_columns:
# Identify outliers using IQR
Q1 = self.data[col].quantile(0.25)
Q3 = self.data[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Handle outliers by capping extreme values
self.data[col] = self.data[col].clip(lower=lower_bound, upper=upper_bound)
# Identify and handle outliers for categorical columns
for col in categorical_columns:
# Identify rare categories based on a dynamic threshold
threshold_count = len(self.data) * threshold_percentage / 100
category_counts = self.data[col].value_counts()
rare_categories = category_counts[category_counts < threshold_count].index
# Handle rare categories by replacing them with a common label or removing them
self.data[col] = self.data[col].replace(rare_categories, 'Other')
# Visualize the impact of outlier handling for numerical columns
plt.figure(figsize=(15, 6))
sns.boxplot(data=self.data[numerical_columns])
plt.xticks(rotation=90)
plt.title('Boxplot After Handling Numerical Outliers')
plt.show()
# Visualize the impact of outlier handling for categorical columns
for col in categorical_columns:
plt.figure(figsize=(10, 6))
sns.countplot(x=col, data=self.data)
plt.xticks(rotation=90)
plt.title(f'Count Plot for {col} After Handling Outliers')
plt.show()
def msno_plot(self):
msno.bar(self.data) # you can see pandas-profilin count part
plt.title('Count of Values per Column in Dataset for Missing value Analysis', size=16)
plt.show()
def handle_missing_values(self, method= 'drop'):
"""
Handle missing values in the dataset.
Parameters:
- method (str): Method for handling missing values. Options: 'drop', 'fillna_mean', 'fillna_median'.
"""
if method == 'drop':
self.data.dropna(inplace=True)
elif method == 'fillna_mean':
self.data.fillna(self.data.mean(), inplace=True)
elif method == 'fillna_median':
self.data.fillna(self.data.median(), inplace=True)
elif method == 'linear':
self.data.interpolate(method='linear', inplace = True)
else:
raise ValueError("Invalid method. Use 'drop', 'fillna_mean', 'fillna_median' or 'linear'.")
# df_filled = self.data.fillna(method='ffill')
return self.data
def handle_duplicates(self):
"""
Remove duplicate rows from the dataset.
"""
self.data.drop_duplicates(inplace=True)
# def handle_outliers(self, method='z-score', threshold=3):
# """
# Handle outliers in the dataset.
# Parameters:
# - method (str): Method for handling outliers. Options: 'z-score', 'IQR'.
# - threshold (float): Threshold for identifying outliers.
# """
# numeric_columns = self.data.select_dtypes(include=[np.number]).columns
# for col in numeric_columns:
# if method == 'z-score':
# z_scores = (self.data[col] - self.data[col].mean()) / self.data[col].std()
# self.data = self.data[(np.abs(z_scores) < threshold)]
# elif method == 'IQR':
# Q1 = self.data[col].quantile(0.25)
# Q3 = self.data[col].quantile(0.75)
# IQR = Q3 - Q1
# self.data = self.data[~((self.data[col] < (Q1 - 1.5 * IQR)) | (self.data[col] > (Q3 + 1.5 * IQR)))]
# else:
# raise ValueError("Invalid method. Use 'z-score' or 'IQR'.")
def handle_datetime(self, format='%Y-%m-%d %H:%M:%S'):
"""
Convert the first datetime column found to datetime format.
Parameters:
- format (str): Format of the datetime column.
"""
for col in self.data.columns:
try:
self.data[col] = pd.to_datetime(self.data[col], format=format)
break # Stop after the first datetime column is found and converted
except (TypeError, ValueError):
pass # Continue if the conversion fails for the current column
else:
print("No datetime column found in the dataset.")
def reset_index(self):
"""
Reset the index of the DataFrame.
"""
self.data.reset_index(drop=True, inplace=True)
def handle_cat_missing_values(self, method='fillna_mode', fill_value = "others" ):
for col in self.data.select_dtypes(include='object').columns:
if method == 'drop':
self.data.dropna(inplace=True)
elif method == 'fillna_mode':
self.data.fillna(self.data.mode().iloc[0], inplace=True) # Using iloc[0] to handle cases where mode returns multiple values
elif method == 'fillna_others':
self.data[col].fillna(fill_value, inplace=True)
def plot_resampled_time_series(self, date_col='date', value_col='value'):
"""
Plot time series graph after automatic resampling based on data's time range.
Parameters:
- data (DataFrame): The input time series data.
- date_col (str): The name of the column containing date values.
- value_col (str): The name of the column containing numeric values.
Returns:
- None
"""
df = self.data
datetime_column = self.type_column[0]
# Convert the datetime column to a pandas datetime object
df[datetime_column] = pd.to_datetime(df[datetime_column])
# Set the datetime column as the index
df.set_index(datetime_column, inplace=True)
# Downsample with Mean
resampled_df = df.resample('D').mean()
# Plot original and resampled data
plt.figure(figsize=(12, 6))
plt.plot(df.index, df[self.dependent_var], label='Original Data', marker='o')
plt.plot(resampled_df.index, resampled_df[self.dependent_var], label='Resampled Data', marker='o')
plt.title('Original vs Resampled Time Series Data')
plt.xlabel('Date')
plt.ylabel('Your Column')
plt.legend()
plt.show()
self.data = df