-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal_project_functions.py
68 lines (53 loc) · 2.59 KB
/
final_project_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def num_cols_distribution(numerical_dataframe):
    """Display a histogram and a box plot, side by side, for every column.

    For each column of ``numerical_dataframe`` a 12x3-inch figure with two
    panels is created and shown: a 50-bin blue histogram with a KDE overlay
    on the left, and a box plot of the same values on the right.

    Args:
        numerical_dataframe: DataFrame whose columns are plotted one by one.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    for column in numerical_dataframe.columns:
        values = numerical_dataframe[column]

        # One row, two panels: distribution on the left, box plot on the right.
        _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 3))
        sns.histplot(values, bins=50, kde=True, color='blue', ax=hist_ax)
        sns.boxplot(x=values, ax=box_ax)

        plt.tight_layout()  # adjust the layout so the panels do not overlap
        plt.show()
def outliers(numerical_dataframe, upper_percentil, lower_percentil):
    """Compute the upper outlier limit of every column.

    For each column, the distance between the two requested percentiles is
    used as the inter-percentile range (the classic IQR when called with 75
    and 25). The upper limit is ``upper percentile + 1.5 * range``.

    Args:
        numerical_dataframe: DataFrame whose columns are evaluated one by one.
        upper_percentil: Upper percentile, on the 0-100 scale used by
            ``numpy.percentile`` (e.g. 75).
        lower_percentil: Lower percentile, on the same scale (e.g. 25).

    Returns:
        dict mapping each column label to its upper limit, rounded to zero
        decimals and returned as a float. Empty when there are no columns.
    """
    import numpy as np

    limits = {}
    for column in numerical_dataframe.columns:
        # Both percentiles in a single pass over the column.
        lower_value, upper_value = np.percentile(
            numerical_dataframe[column], [lower_percentil, upper_percentil]
        )
        spread = upper_value - lower_value
        limits[column] = float(round(upper_value + 1.5 * spread, 0))
    return limits
def log_transfom_clean(x):
    """Return the natural logarithm of ``x``, or NaN where it is undefined.

    Args:
        x: A single numeric value.

    Returns:
        ``np.log(x)`` when ``x`` is finite and strictly positive. NaN for
        zero, negative, infinite or NaN input, so that the caller can
        replace those NaNs later.
    """
    import numpy as np

    # Guard with ``x > 0``: the logarithm of a negative number is NaN anyway,
    # and checking first avoids NumPy's "invalid value" RuntimeWarning.
    if np.isfinite(x) and x > 0:
        return np.log(x)
    # ``np.nan`` is the supported spelling; the ``np.NAN`` alias was removed
    # in NumPy 2.0.
    return np.nan
def tree_pruning(x):
    """Placeholder with no pruning logic.

    Args:
        x: Accepted but currently unused.

    Returns:
        None.
    """
    # TODO: implement the pruning step or delete this stub.
    return None
def tree_cleaning(dataset):
    """Drop unused columns, rename the kept ones and remove sparse NaN rows.

    Steps:
      1. Drop a fixed list of columns that are not needed.
      2. Rename the remaining source columns to upper-case English labels.
      3. If at least one column has a share of missing values strictly
         between 0% and 1%, drop every row that contains a missing value.

    The input DataFrame is not modified; a new DataFrame is returned.

    Args:
        dataset: DataFrame containing (at least) every column listed in
            ``columns_to_drop`` below.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If any of the columns to drop is absent from ``dataset``.
    """
    columns_to_drop = [
        'x_etrs89', 'y_etrs89', 'tipus_element', 'espai_verd', 'adreca',
        'nom_castella', 'nom_catala', 'data_plantacio', 'tipus_aigua',
        'tipus_reg', 'geom', 'catalogacio', 'codi_barri', 'nom_barri',
        'codi_districte',
    ]
    new_names = {
        'codi': 'ID',
        'latitud': 'LATITUDE',
        'longitud': 'LONGITUDE',
        'cat_especie_id': 'SPECIES_ID',
        'nom_cientific': 'CIENTIFIC_NAME',
        'categoria_arbrat': 'CATEGORY',
        'nom_districte': 'DISTRICT',
    }

    dataset = dataset.drop(columns=columns_to_drop).rename(columns=new_names)

    # Percentage of missing values per column. Deliberately NOT rounded:
    # rounding to two decimals would turn a tiny share (e.g. 0.003%) into 0.0
    # and make the threshold test below skip it.
    null_percentage = dataset.isna().mean() * 100

    # Missing values are removed only when some column has a small (< 1%)
    # but non-zero share of them.
    # NOTE(review): dropna() removes rows with a NaN in ANY column, including
    # columns whose null share is above 1% - confirm this is intended.
    has_sparse_nulls = ((null_percentage > 0) & (null_percentage < 1)).any()
    if has_sparse_nulls:
        dataset = dataset.dropna()

    return dataset