-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
98 lines (68 loc) · 2.69 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# FUNCTIONS FOR ALL! :D
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import ConfusionMatrixDisplay
# Get to know the data
def df_info(df):
    """Print a quick overview of a dataframe.

    Prints, in this order and separated by newlines: the number of nulls per
    column, the ``df.info()`` summary, the shape, and the list of column
    names.

    Args:
        df: The dataframe to inspect.

    Returns:
        None. The overview is only printed.
    """
    import io

    # df.info() writes directly to stdout and returns None. Passing its
    # return value to print() would emit the summary before everything else
    # and then print a stray "None", so capture the text in a buffer first.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print(
        df.isna().sum(),
        info_buffer.getvalue().rstrip("\n"),
        df.shape,
        list(df.columns),
        sep="\n",
    )
# Nulls percent
def nulls_percent(df):
    '''
    Summarizes how much of each column of a data frame is null.

    Args:
        df: The dataframe we want to check out.

    Returns:
        A new df with 2 columns:
            - 'column_name' with the name of the original df columns
            - 'nulls_percentage' with the share of nulls in every column,
              computed as null count / number of rows (a fraction, not
              multiplied by 100)
    '''
    null_share = df.isna().sum() / len(df)
    # Move the column labels out of the index into a regular column.
    summary = null_share.to_frame().reset_index()
    summary.columns = ['column_name', 'nulls_percentage']
    return summary
# Plot Numerical distributions
def plot_distributions(df):
    """Plot a histogram for every numerical column of a dataframe.

    Args:
        df: The dataframe whose numerical columns are plotted.

    Returns:
        None. One figure per numerical column is shown.
    """
    # Read from the `df` argument (not an undefined global) and draw with
    # matplotlib, which this module already imports as `plt`.
    for col in df.select_dtypes(include="number").columns:
        _, ax = plt.subplots()
        # Histograms cannot bin missing values, so drop them first.
        ax.hist(df[col].dropna())
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        plt.show()
# Standardize column headers
def standarizate_cols(df):
    '''
    Standardizes column names
        - Sets cols to lowercase.
        - Replaces empty space for '_'.

    Args:
        df: The dataframe to be standardized.

    Returns:
        A new df with the same data and standardized column names. The
        input df is not modified.
    '''
    def _clean(label):
        # Leave non-string labels (e.g. integer column names) untouched
        # instead of failing on them.
        if isinstance(label, str):
            return label.lower().replace(' ', '_')
        return label

    # rename() returns a new dataframe, so the data is kept and only the
    # column labels change.
    return df.rename(columns=_clean)
# Unique Values for Categorical Columns
def unique_values(df):
    """Print the number of distinct values of each object-dtype column.

    Columns are picked with ``select_dtypes('O')``; for each one a line is
    printed with the column name and the number of entries returned by its
    ``value_counts()``.

    Args:
        df: The dataframe to inspect.

    Returns:
        None. The counts are only printed.
    """
    object_columns = df.select_dtypes('O').columns
    for name in object_columns:
        distinct = len(df[name].value_counts())
        print(f'The categorical column {name} has {distinct} unique values')
# Plotting Results with CM
def plot_results(y_test, y_pred_test):
    """Print classification metrics for the test set and show its confusion matrix.

    Prints accuracy, precision, recall, F1 and Cohen's kappa (two decimals
    each), then plots the confusion matrix. Precision, recall and F1 are
    computed with ``pos_label=1``.

    Args:
        y_test: True labels of the test set.
        y_pred_test: Predicted labels for the test set.

    Returns:
        None. Metrics are printed and the plot is shown.
    """
    positive = {"pos_label": 1}
    # (label, scorer, extra keyword arguments), in the order they are printed.
    metrics = (
        ("accuracy", accuracy_score, {}),
        ("precision", precision_score, positive),
        ("recall", recall_score, positive),
        ("F1", f1_score, positive),
        ("Kappa", cohen_kappa_score, {}),
    )
    for label, scorer, extra in metrics:
        score = scorer(y_test, y_pred_test, **extra)
        print(f"The {label} in the TEST set is: {score:.2f}")

    matrix = confusion_matrix(y_test, y_pred_test)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.show()