Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions model_slo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
import joblib

# Load the SLO dataset. The "violate" column is the classification target;
# every remaining column is passed to the models as a feature.
df = pd.read_csv("expanded_pipeline_data_slo.csv")

y = df["violate"]
x = df.drop(columns=["violate"])

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps
# the class proportions the same in the train and test partitions, so the
# held-out precision/recall/F1 are not distorted by an unlucky split when
# one class is rare.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline model: standardize the features, then fit a logistic regression.
# Pipeline.fit returns the fitted pipeline itself, so the fit is chained
# directly onto the constructor.
linear_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logistic", LogisticRegression(random_state=42)),
    ]
).fit(x_train, y_train)

# Predict the held-out rows with the fitted baseline.
lr_preds = linear_pipeline.predict(x_test)

# Gradient-boosted classifier wrapped in a single-step pipeline; the step
# name "xgb" is what the "xgb__" prefix in the search space refers to.
xg_pipeline = Pipeline(
    steps=[
        (
            "xgb",
            XGBClassifier(objective="binary:logistic", random_state=42),
        ),
    ]
)

# Hyperparameter search space for the "xgb" pipeline step.
# scipy's randint(low, high) draws integers from [low, high), and
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_dist = dict(
    xgb__n_estimators=randint(100, 500),  # integers 100..499
    xgb__max_depth=randint(3, 15),  # integers 3..14
    # NOTE(review): this spans [0.005, 0.305], not [0.005, 0.3] -- confirm
    # the intended upper bound.
    xgb__learning_rate=uniform(0.005, 0.3),
    xgb__subsample=uniform(0.5, 0.5),  # floats in [0.5, 1.0]
    xgb__colsample_bytree=uniform(0.5, 0.5),  # floats in [0.5, 1.0]
    xgb__reg_alpha=uniform(0, 1),  # floats in [0, 1]
    xgb__reg_lambda=uniform(0, 1),  # floats in [0, 1]
)

# Randomized hyperparameter search over the XGBoost pipeline: 30 sampled
# candidates, 5-fold cross-validation, candidates ranked by F1, all CPU
# cores used for the search. fit() returns the search object itself, so it
# is chained onto the constructor.
random_search = RandomizedSearchCV(
    estimator=xg_pipeline,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(x_train, y_train)

# Take the best pipeline found by the search and predict the held-out rows.
best_model = random_search.best_estimator_
xgb_preds = best_model.predict(x_test)


def eval_model(y_true, y_pred, model_name):
    """Print headline classification metrics and a full report for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model for the same rows.
        model_name: Display name shown in the header line.

    Returns:
        None. Everything is written to standard output.
    """
    # Label/scorer pairs, listed in the order they are printed.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    print(f"Evaluation metrics for {model_name}:")
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_true, y_pred)}")

    report = classification_report(y_true, y_pred)
    print(f"Classification Report:\n{report}\n")


# Report held-out metrics for the baseline and the tuned XGBoost model.
eval_model(y_true=y_test, y_pred=lr_preds, model_name="Logistic Regression")
eval_model(y_true=y_test, y_pred=xgb_preds, model_name="XGBoost")

# Persist both fitted pipelines to disk.
joblib.dump(value=best_model, filename="xgb_pipeline.joblib")
joblib.dump(value=linear_pipeline, filename="linear_pipeline.joblib")