From 8e2f1ed2a6829708fa440f16eff6d8103839155c Mon Sep 17 00:00:00 2001
From: Edward Hu
Date: Wed, 17 Dec 2025 11:53:21 -0600
Subject: [PATCH] Implement Logistic Regression and XGBoost pipelines

This script builds two classification pipelines, Logistic Regression and
XGBoost, to predict violations from the provided dataset. It covers the
train/test split, model training, hyperparameter tuning of the XGBoost
model with RandomizedSearchCV, evaluation with accuracy, precision,
recall, and F1, and saving of the trained pipelines with joblib.
---
 model_slo.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 model_slo.py

diff --git a/model_slo.py b/model_slo.py
new file mode 100644
index 0000000..c709a50
--- /dev/null
+++ b/model_slo.py
@@ -0,0 +1,79 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
+from sklearn.pipeline import Pipeline
+from scipy.stats import uniform, randint
+from xgboost import XGBClassifier
+import joblib
+
+# Load the dataset and separate the target column from the features
+df = pd.read_csv("expanded_pipeline_data_slo.csv")
+
+y = df["violate"]
+x = df.drop(columns=["violate"])
+
+# Hold out 20% of the data for evaluation
+x_train, x_test, y_train, y_test = train_test_split(
+    x, y, test_size=0.2, random_state=42
+)
+
+# Logistic Regression pipeline: standardize features, then fit the classifier
+linear_pipeline = Pipeline(
+    [("scaler", StandardScaler()), ("logistic", LogisticRegression(random_state=42))]
+)
+
+linear_pipeline.fit(x_train, y_train)
+lr_preds = linear_pipeline.predict(x_test)
+
+# XGBoost pipeline (tree-based, so no scaling step is needed)
+xg_pipeline = Pipeline(
+    [("xgb", XGBClassifier(objective="binary:logistic", random_state=42))]
+)
+
+# Search space for the XGBoost hyperparameters
+param_dist = {
+    "xgb__n_estimators": randint(100, 500),
+    "xgb__max_depth": randint(3, 15),
+    "xgb__learning_rate": uniform(0.005, 0.3),
+    "xgb__subsample": uniform(0.5, 0.5),
+    "xgb__colsample_bytree": uniform(0.5, 0.5),
+    "xgb__reg_alpha": uniform(0, 1),
+    "xgb__reg_lambda": uniform(0, 1),
+}
+
+# Randomized search over 30 sampled configurations, scored by F1 with 5-fold CV
+random_search = RandomizedSearchCV(
+    xg_pipeline,
+    param_distributions=param_dist,
+    n_iter=30,
+    scoring="f1",
+    cv=5,
+    verbose=1,
+    n_jobs=-1,
+    random_state=42,
+)
+
+random_search.fit(x_train, y_train)
+best_model = random_search.best_estimator_
+xgb_preds = best_model.predict(x_test)
+
+
+def eval_model(y_true, y_pred, model_name):
+    """Print accuracy, precision, recall, F1, and a full classification report."""
+    print(f"Evaluation metrics for {model_name}:")
+    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
+    print(f"Precision: {precision_score(y_true, y_pred)}")
+    print(f"Recall: {recall_score(y_true, y_pred)}")
+    print(f"F1 Score: {f1_score(y_true, y_pred)}")
+    print(f"Classification Report:\n{classification_report(y_true, y_pred)}\n")
+
+
+eval_model(y_test, lr_preds, "Logistic Regression")
+eval_model(y_test, xgb_preds, "XGBoost")
+
+# Persist both trained pipelines for later reuse
+joblib.dump(best_model, "xgb_pipeline.joblib")
+joblib.dump(linear_pipeline, "linear_pipeline.joblib")
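
Usage sketch (not part of the commit, and only an assumption about how the
saved artifacts would be consumed): given the file names written by the
joblib.dump calls above and a dataframe with the same feature columns as the
training data, the persisted pipelines can be reloaded and used for
prediction like this:

    import joblib
    import pandas as pd

    # Reload the persisted pipelines (file names match the joblib.dump calls in the patch)
    xgb_model = joblib.load("xgb_pipeline.joblib")
    lr_model = joblib.load("linear_pipeline.joblib")

    # Illustrative input: reuse the training CSV minus the target column;
    # any dataframe with the same feature columns would work the same way
    features = pd.read_csv("expanded_pipeline_data_slo.csv").drop(columns=["violate"])
    print(xgb_model.predict(features)[:10])
    print(lr_model.predict(features)[:10])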