-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel3.py
86 lines (66 loc) · 4.25 KB
/
model3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
from main import *
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
import numpy as np
# --- Data preparation -------------------------------------------------------
# All helpers below come from the project's `main` module (star-imported).

# Step 1: load "data.csv" and clean it. 'customer_id' is handed to the helper
# (presumably the column(s) to discard -- confirm in main).
data_set = import_and_clean_data(
    "data.csv",
    ['customer_id'],
)

# Step 2: coerce the listed columns (including the 'churn' target) to numeric.
data_set = convert_row_to_numeric(
    [
        'age',
        'income',
        'monthly_minutes',
        'monthly_data_gb',
        'support_tickets',
        'monthly_bill',
        'outstanding_balance',
        'churn',
    ],
    data_set,
)

# Step 3: outlier removal for the columns treated as normally distributed.
# The helper returns the filtered frame together with the outliers it found.
data_set, normal_outliers = remove_outliers_normal_distribution_data(
    data_set,
    ['age', 'monthly_minutes', 'outstanding_balance'],
)

# Step 4: outlier removal for the columns treated as skewed.
data_set, skewed_outliers = remove_outliers_skewed_distributions(
    data_set,
    ['income', 'monthly_data_gb', 'support_tickets', 'monthly_bill'],
)

# Step 5: one-hot encode 'region'; drop_first=True omits the first category
# so the dummy columns are not perfectly collinear.
data_set = pd.get_dummies(data_set, columns=['region'], drop_first=True)
print(data_set.info())

# Step 6: split the cleaned data into one subset per year.
year1_data, year2_data, year3_data = divide_into_years(data_set)

# Step 7: build features (x) and target (y) for each year with "churn" as the
# target. Per the original note the helper oversamples and normalises the
# data -- confirm in main. The calls run in year order: 1, 2, 3.
# The returned feature sets are used as-is (not re-wrapped into DataFrames
# with explicit column names).
(x_1, y_1), (x_2, y_2), (x_3, y_3) = (
    handle_class_imbalance(year_frame, "churn")
    for year_frame in (year1_data, year2_data, year3_data)
)
# Step 8: per-sample weights that rise linearly within each year, with later
# years starting higher (year 1: 1 -> 2, year 2: 2 -> 3, year 3: 3 -> 3.5).
weights_1 = np.linspace(start=1, stop=2, num=len(y_1))
weights_2 = np.linspace(start=2, stop=3, num=len(y_2))
weights_3 = np.linspace(start=3, stop=3.5, num=len(y_3))

# Three identically configured, still unfitted SGD classifiers, one per year.
sgd_model_year1, sgd_model_year2, sgd_model_year3 = (
    SGDClassifier(max_iter=1000, tol=1e-3) for _ in range(3)
)

# Step 9: fit the year-1 model on year-1 data with its weights.
sgd_model_year1.fit(x_1, y_1, sample_weight=weights_1)

# Step 10: online update -- partial_fit continues from the year-1 fit using
# the year-2 data instead of starting over.
sgd_model_year1.partial_fit(x_2, y_2, sample_weight=weights_2)

# Step 11: independent models trained only on year 2 and only on year 3.
# The fit order is kept as year 2 then year 3: no random_state is set, so the
# classifiers draw from the shared global RNG.
sgd_model_year2.fit(x_2, y_2, sample_weight=weights_2)
sgd_model_year3.fit(x_3, y_3, sample_weight=weights_3)
# Step 12: Combine the three SGD models in a hard-voting (majority) ensemble.
# Hard voting is required here: soft voting averages predict_proba outputs,
# which SGDClassifier does not provide with its default hinge loss.
# NOTE(review): VotingClassifier.fit() trains *clones* of the estimators
# listed below, so any state the sgd_model_year* objects learned before this
# point is not carried into the ensemble -- only their hyper-parameters are.
ensemble_model = VotingClassifier(
    estimators=[
        ('sgd_year1', sgd_model_year1),
        ('sgd_year2', sgd_model_year2),
        ('sgd_year3', sgd_model_year3),
    ],
    voting='hard',
)

# Step 13: Train the ensemble on the combined Year 1 + Year 2 data.
x_1_2 = np.concatenate([x_1, x_2])
y_1_2 = np.concatenate([y_1, y_2])
weights_1_2 = np.concatenate([weights_1, weights_2])
# Pass the concatenated per-sample weights so the recency weighting is
# actually applied; they were previously built but never handed to fit().
ensemble_model.fit(x_1_2, y_1_2, sample_weight=weights_1_2)

# Step 14: Predict on Year 3, which the ensemble was not fitted on.
y3_pred_ensemble = ensemble_model.predict(x_3)

# Step 15: Evaluate the ensemble on Year 3.
# NOTE(review): x_3 / y_3 come from handle_class_imbalance; if that helper
# resamples the data, these metrics describe the resampled set rather than
# the raw Year 3 class distribution -- confirm.
ensemble_accuracy = accuracy_score(y_3, y3_pred_ensemble)
ensemble_precision = precision_score(y_3, y3_pred_ensemble)
ensemble_recall = recall_score(y_3, y3_pred_ensemble)
ensemble_f1 = f1_score(y_3, y3_pred_ensemble)

print(f"Ensemble Model - Accuracy: {ensemble_accuracy}")
print(f"Ensemble Model - Precision: {ensemble_precision}")
print(f"Ensemble Model - Recall: {ensemble_recall}")
print(f"Ensemble Model - F1 Score: {ensemble_f1}")