"""funcs.py

Helper functions for reading, quality-checking and preprocessing tabular data.
"""
# import dependencies
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

import utils

## function to read data
## and check for data quality issues
def read_and_qa(pth_str):
    """Read a csv file and summarise its data quality.

    Parameters
    ----------
    pth_str : str
        Path to the csv file to read.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The data itself and a per-column summary of null counts.
    """
    # check if path exists
    if not os.path.exists(pth_str):
        raise utils.InvalidFilePath(pth_str)
    # read file
    df = pd.read_csv(pth_str)
    # optionally take 10000 random rows
    # df = df.iloc[np.random.choice(np.arange(len(df)), size=10000)]
    # df.reset_index(drop=True, inplace=True)
    df = df.copy()
    ##############################
    # check for nulls
    data_types = []
    non_nulls = []
    nulls = []
    null_column_percent = []
    null_df_percent = []
    df_cols = df.columns
    print("There are {} columns and {} records in the dataframe.".format(len(df_cols), len(df)))
    # loop through columns and capture the variables above
    print("Extracting counts and percentages of nulls and non-nulls")
    for col in df_cols:
        # extract null count
        null_count = df[col].isna().sum()
        nulls.append(null_count)
        # extract non-null count
        non_null_count = len(df) - null_count
        non_nulls.append(non_null_count)
        # extract % of nulls in column
        col_null_perc = 100 * null_count / len(df)
        null_column_percent.append(col_null_perc)
        if null_count == 0:
            null_df_percent.append(0)
        else:
            # extract % of nulls out of total nulls in dataframe
            df_null_perc = 100 * null_count / df.isna().sum().sum()
            null_df_percent.append(df_null_perc)
        # capture data types
        data_types.append(df[col].dtypes)
    # create zipped list with column names, data types, nulls and non-nulls
    lst_data = list(zip(df_cols, data_types, non_nulls, nulls, null_column_percent, null_df_percent))
    # create dataframe of zipped list
    df_zipped = pd.DataFrame(lst_data, columns=['Feature', 'DataType', 'CountOfNonNulls', 'CountOfNulls',
                                                'PercentOfNullsInColumn', 'PercentOfNullsInData'])
    return df, df_zipped
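
# Example usage (illustrative sketch; 'data/profiles.csv' is an assumed path,
# not one defined by this module):
# profiles_df, qa_summary = read_and_qa('data/profiles.csv')
# print(qa_summary.sort_values('PercentOfNullsInColumn', ascending=False).head())
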
def unique_vals_counts(df: DataFrame) -> DataFrame:
    """Count unique values in a dataframe.

    The count is performed on all object (categorical) columns in the
    dataframe, excluding the free-text 'essay' columns and 'last_online'.

    Parameters
    ----------
    df : DataFrame
        Dataframe to check for unique values per column.

    Returns
    -------
    DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    df_cats = df.select_dtypes(include='object')
    df_cats_cols = df_cats.columns.tolist()
    for col in df_cats_cols:
        # skip free-text and timestamp columns
        if 'essay' in col or 'last_online' in col:
            df_cats = df_cats.drop(col, axis=1)
    vals = df_cats.nunique().reset_index()
    return vals.rename(columns={'index': 'column', 0: 'count'})
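
# Example usage (illustrative; assumes profiles_df from the read_and_qa example above):
# cat_counts = unique_vals_counts(profiles_df)
# print(cat_counts.sort_values('count', ascending=False))
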
def unique_vals_column(df: DataFrame, col: str, normalize=False) -> DataFrame:
    """Count unique values in a single column in a dataframe.

    Value counts are calculated for a single column and tabulated.

    Parameters
    ----------
    df : DataFrame
        Dataframe containing the column to check for unique values.
    col : str
        Name of column to check for unique values.
    normalize : bool, optional
        If True, the counts are normalized to fractions of the total.
        (Default value = False)

    Returns
    -------
    DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    if col not in df.columns:
        raise utils.InvalidColumn(col)
    if normalize:
        uniques = df[col].value_counts(normalize=True).reset_index().rename(
            columns={'index': col, col: 'percentOfTotal'})
    else:
        uniques = df[col].value_counts().reset_index().rename(
            columns={'index': col, col: 'count'})
    return uniques
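
# Example usage (illustrative; 'status' is an assumed column name):
# status_counts = unique_vals_column(profiles_df, 'status')
# status_shares = unique_vals_column(profiles_df, 'status', normalize=True)
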
def merge_val_labels(df1, df2, col_to_merge):
    """Merge two DataFrames.

    Two DataFrames are merged (left join) on a shared merging column.

    Parameters
    ----------
    df1 : DataFrame
        First DataFrame to merge.
    df2 : DataFrame
        Second DataFrame to merge.
    col_to_merge : str
        Name of column to merge on.

    Returns
    -------
    DataFrame
    """
    # check that both inputs are valid dataframes
    if not isinstance(df1, pd.DataFrame):
        raise utils.InvalidDataFrame(df1)
    if not isinstance(df2, pd.DataFrame):
        raise utils.InvalidDataFrame(df2)
    # check that the column to merge on is in both dataframes
    if col_to_merge not in df1.columns.tolist():
        raise utils.InvalidColumn(col_to_merge)
    if col_to_merge not in df2.columns.tolist():
        raise utils.InvalidColumn(col_to_merge)
    df_merged = df1.merge(df2, how='left', on=col_to_merge)
    return df_merged
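
# Example usage (illustrative; labels_df and the 'code' column are assumptions,
# not defined in this module):
# merged_df = merge_val_labels(profiles_df, labels_df, 'code')
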
def transform_to_cat(df, cols_to_cat):
    """Convert column values to object data type.

    DataFrame columns are converted to the object data type.

    Parameters
    ----------
    df : DataFrame
        Contains columns to convert to object data type.
    cols_to_cat : list
        List of column names to convert to object data type.

    Returns
    -------
    DataFrame
    """
    # check for valid dataframe
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    # extract columns
    df_cols = df.columns.tolist()
    # check if all columns to convert are in the dataframe
    membership = all(col in df_cols for col in cols_to_cat)
    if not membership:
        # collect the missing columns and report them
        not_cols = [col for col in cols_to_cat if col not in df_cols]
        raise utils.InvalidColumn(not_cols)
    # convert each column to object data type
    for col in cols_to_cat:
        df[col] = df[col].astype(object)
    return df
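
# Example usage (illustrative; the column names are assumptions):
# profiles_df = transform_to_cat(profiles_df, ['sign', 'speaks'])
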
def save_transformed_data(df, filepath):
    """Save DataFrame to csv.

    Save DataFrame to a file in csv format.

    Parameters
    ----------
    df : DataFrame
        DataFrame to save to file.
    filepath : str
        File path to save DataFrame in csv format.

    Returns
    -------
    None
    """
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    # save file in csv format
    df.to_csv(filepath, index=False)
    return None
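
# Example usage (illustrative; the output path is an assumption):
# save_transformed_data(profiles_df, 'data/profiles_clean.csv')
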
def dim_redux(df, num_cols, threshold=0.9):
    """Drop highly correlated features.

    Reduce dimensionality by dropping highly correlated features based on
    a correlation threshold.

    Parameters
    ----------
    df : DataFrame
        DataFrame containing numerical features to check for correlation.
    num_cols : list
        Numerical columns to evaluate for correlation.
    threshold : float, optional
        Features with an absolute pairwise correlation above the threshold
        are dropped. (Default value = 0.9)

    Returns
    -------
    DataFrame
    """
    # check for valid dataframe
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    # extract dataframe columns
    df_cols = df.columns.tolist()
    # check if all numeric columns are in the dataframe
    membership = all(col in df_cols for col in num_cols)
    if not membership:
        # collect the missing columns and report them
        not_cols = [col for col in num_cols if col not in df_cols]
        raise utils.InvalidColumn(not_cols)
    # extract data of numeric columns
    df_corr = df[num_cols]
    # get absolute correlation values
    corr_df = df_corr.corr().abs()
    # mask the upper triangle so each pair is evaluated only once
    mask = np.triu(np.ones_like(corr_df, dtype=bool))
    tri_df = corr_df.mask(mask)
    to_drop = [col for col in tri_df.columns if any(tri_df[col] > threshold)]
    if len(to_drop) == 0:
        print("There are no highly correlated features. No column will be dropped!")
    else:
        for col in to_drop:
            print("Dropping highly correlated feature: {}".format(col))
    # drop the highly correlated features
    reduced_df = df.drop(to_drop, axis=1)
    return reduced_df
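
# Example usage (illustrative; 'age', 'income' and 'height' are assumed numeric columns):
# reduced_df = dim_redux(profiles_df, ['age', 'income', 'height'], threshold=0.9)
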
def drop_low_var_cols(df, target_var=None, unique_val_threshold=0.9):
    """Drop low variance categorical features.

    Drops a column if its largest normalized value count is greater than or
    equal to the threshold, i.e. a single value dominates the column.

    Parameters
    ----------
    df : DataFrame
        Dataframe to evaluate and drop columns from if the condition is true.
    target_var : str, optional
        Feature to exclude from the low variance analysis.
        (Default value = None)
    unique_val_threshold : float, optional
        Features whose dominant value frequency is at or above the threshold
        are dropped. (Default value = 0.9)

    Returns
    -------
    DataFrame
    """
    if not isinstance(df, pd.DataFrame):
        raise utils.InvalidDataFrame(df)
    df_cols = df.columns.tolist()
    # if a target variable is given, it must exist in the dataframe
    if target_var and target_var not in df_cols:
        raise utils.InvalidColumn(target_var)
    for col in df_cols:
        # only evaluate categorical columns, excluding the target variable
        if df[col].dtypes == 'object' and col != target_var:
            uniques = unique_vals_column(df, col, normalize=True)
            # get max normalized value count
            max_unique = uniques['percentOfTotal'].max()
            # drop columns whose dominant value meets or exceeds the threshold
            if max_unique >= unique_val_threshold:
                print("Dropping low variance feature: {}".format(col))
                df.drop(col, axis=1, inplace=True)
    return df
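
# Example usage (illustrative; 'income' as the target variable is an assumption):
# profiles_df = drop_low_var_cols(profiles_df, target_var='income', unique_val_threshold=0.9)
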
def missing_val_selector(df, missing_val_threshold=0):
    """Drop columns whose share of missing values exceeds a threshold.

    Parameters
    ----------
    df : DataFrame
        Dataframe to evaluate for missing values.
    missing_val_threshold : float, optional
        Threshold as a percentage (0-100). (Default value = 0)

    Returns
    -------
    DataFrame
    """
    # convert the percentage threshold to a fraction
    threshold = missing_val_threshold / 100
    # mask of columns with a missing value fraction above the threshold
    high_null_mask = df.isna().sum() / len(df) > threshold
    if high_null_mask.any():
        high_null_cols = df.loc[:, high_null_mask].columns.tolist()
        for col in high_null_cols:
            print("Dropping {} due to missing value share above threshold: {}".format(col, threshold))
    else:
        print("There are no columns above the threshold: {}".format(threshold))
    # keep only the columns at or below the threshold
    red_df = df.loc[:, ~high_null_mask]
    return red_df
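
# Example usage (illustrative; drops columns with more than 30% missing values):
# profiles_df = missing_val_selector(profiles_df, missing_val_threshold=30)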