main.py
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import joblib
# Import custom modules
from data_processing import load_data, preprocess_data
from model_training import balance_classes, tune_hyperparameters
from evaluate_models import evaluate_model, visualize_results
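# The helpers above are assumed to have these interfaces, inferred from the
# call sites in main() below (the real implementations live in their own modules):
#   load_data(path) -> pd.DataFrame
#   preprocess_data(df) -> (features, target)
#   balance_classes(X, y) -> (X_balanced, y_balanced)
#   tune_hyperparameters(name, pipeline, X_train, y_train) -> fitted estimator
#   evaluate_model(model, X_test, y_test) -> (accuracy, auc_roc)
#   visualize_results(results) -> None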
# Configure logging
logging.basicConfig(level=logging.INFO)


def main(file_path):
    """Main function to execute the workflow."""
    # Load the data
    data = load_data(file_path)
    logging.info("Initial Data Shape: %s", data.shape)

    # Preprocess the data and separate features (X) and target variable (y)
    X, y = preprocess_data(data)
    logging.info("Processed Data Shape: %s", X.shape)

    # Balance classes, then split the dataset into train and test sets
    X_balanced, y_balanced = balance_classes(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced, y_balanced, test_size=0.2, random_state=42
    )
    # Initialize models for comparison
    models = {
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "SVM": svm.SVC(probability=True),  # probability=True enables predict_proba for AUC-ROC
        # use_label_encoder is deprecated in recent xgboost releases; set eval_metric instead
        "XGBoost": XGBClassifier(eval_metric="logloss"),
        "Logistic Regression": LogisticRegression(max_iter=1000)
    }
    results = {}
    trained_models = {}  # keep each fitted estimator so the winner can be saved later

    # Evaluate each model; tune the two ensemble models inside a scaling pipeline
    for name, model in models.items():
        print(f"Training {name}...")
        if name in ["Random Forest", "XGBoost"]:
            model_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('classifier', model)
            ])
            best_model = tune_hyperparameters(name, model_pipeline, X_train, y_train)
            accuracy, auc_roc = evaluate_model(best_model, X_test, y_test)
            trained_models[name] = best_model
            results[name] = {"Accuracy": accuracy, "AUC-ROC": auc_roc}
            print(f"{name} (Tuned) - Accuracy: {accuracy:.2f}%, AUC-ROC: {auc_roc:.2f}")
        else:
            model.fit(X_train, y_train)
            accuracy, auc_roc = evaluate_model(model, X_test, y_test)
            trained_models[name] = model
            results[name] = {"Accuracy": accuracy, "AUC-ROC": auc_roc}
            print(f"{name} - Accuracy: {accuracy:.2f}%, AUC-ROC: {auc_roc:.2f}")

    visualize_results(results)
    # Select the best model based on AUC-ROC score
    best_model_name = max(results.items(), key=lambda x: x[1]['AUC-ROC'])[0]
    print(f"The best model is {best_model_name} with an AUC-ROC of {results[best_model_name]['AUC-ROC']:.2f}.")

    # Save the *fitted* best model to disk. The models dict still holds the unfitted
    # base estimators for the tuned models, so save from trained_models instead.
    best_model_instance = trained_models[best_model_name]
    model_filename = f'{best_model_name.replace(" ", "_").lower()}_model.pkl'
    joblib.dump(best_model_instance, model_filename)
    print(f"Model saved as '{model_filename}'.")


# Execute main function when script is run directly
if __name__ == "__main__":
    main('bank.csv')
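
For readers who want to run main.py standalone, the sketch below shows one plausible shape for the helper functions it imports. It is an assumption inferred from the call sites above, not the repository's actual data_processing / model_training / evaluate_models code: balance_classes is illustrated with imblearn's SMOTE, tune_hyperparameters with a small GridSearchCV, and the hyperparameter grids and the 'deposit' target column are hypothetical.

# helpers_sketch.py -- illustrative stand-ins for the imported helpers; in the
# repo these would be split across data_processing.py, model_training.py and
# evaluate_models.py. Everything here is inferred from main.py's call sites.
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score


def load_data(file_path):
    """Read the CSV into a DataFrame."""
    return pd.read_csv(file_path)


def preprocess_data(data):
    """Split off the target and one-hot encode categorical features.
    The 'deposit' column name is a guess for the bank-marketing dataset."""
    target = data["deposit"].map({"yes": 1, "no": 0})
    features = pd.get_dummies(data.drop(columns=["deposit"]), drop_first=True)
    return features, target


def balance_classes(X, y, random_state=42):
    """Oversample the minority class with SMOTE so both classes are equal."""
    return SMOTE(random_state=random_state).fit_resample(X, y)


# Hypothetical search spaces, keyed by the model names used in main.py.
# Keys use the 'classifier__' prefix to address that step of the pipeline.
PARAM_GRIDS = {
    "Random Forest": {"classifier__n_estimators": [100, 300],
                      "classifier__max_depth": [None, 10]},
    "XGBoost": {"classifier__n_estimators": [100, 300],
                "classifier__learning_rate": [0.05, 0.1]},
}


def tune_hyperparameters(name, pipeline, X_train, y_train):
    """Grid-search the pipeline and return the best fitted estimator."""
    search = GridSearchCV(pipeline, PARAM_GRIDS[name], cv=5,
                          scoring="roc_auc", n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_estimator_


def evaluate_model(model, X_test, y_test):
    """Return (accuracy as a percentage, AUC-ROC) for a fitted binary classifier."""
    accuracy = accuracy_score(y_test, model.predict(X_test)) * 100
    auc_roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return accuracy, auc_roc


def visualize_results(results):
    """Bar-chart the per-model metrics collected in main()."""
    pd.DataFrame(results).T.plot(kind="bar", rot=0, figsize=(8, 4),
                                 title="Model comparison")
    plt.tight_layout()
    plt.show()

With these stand-ins saved under the module names main.py imports (or adjusted to match the real modules), `python main.py` should run end to end against a local bank.csv.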