"""Train and compare two classifiers for the ``violate`` label.

The script reads ``expanded_pipeline_data_slo.csv``, holds out 20% of the rows
for testing, fits a scaled logistic regression and an XGBoost classifier tuned
by randomized search, prints test-set metrics for both, and saves both fitted
pipelines with joblib.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
import joblib

TARGET = "violate"
SEED = 42


def eval_model(y_true, y_pred, model_name):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same samples.
        model_name: Name shown in the header line of the printout.
    """
    print(f"Evaluation metrics for {model_name}:")
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    # Each score is computed immediately before its own line is printed.
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_true, y_pred)}")
    report = classification_report(y_true, y_pred)
    print(f"Classification Report:\n{report}\n")


# --- Data -------------------------------------------------------------------
df = pd.read_csv("expanded_pipeline_data_slo.csv")

# Every column except the target is used as a feature.
y = df[TARGET]
x = df.drop(columns=[TARGET])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=SEED,
)

# --- Baseline: standardized features + logistic regression -------------------
linear_steps = [
    ("scaler", StandardScaler()),
    ("logistic", LogisticRegression(random_state=SEED)),
]
linear_pipeline = Pipeline(linear_steps)
linear_pipeline.fit(x_train, y_train)
lr_preds = linear_pipeline.predict(x_test)

# --- XGBoost with randomized hyper-parameter search --------------------------
xgb_steps = [
    ("xgb", XGBClassifier(objective="binary:logistic", random_state=SEED)),
]
xg_pipeline = Pipeline(xgb_steps)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
xgb_search_space = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 15),
    "learning_rate": uniform(0.005, 0.3),  # [0.005, 0.305]
    "subsample": uniform(0.5, 0.5),  # [0.5, 1.0]
    "colsample_bytree": uniform(0.5, 0.5),  # [0.5, 1.0]
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0, 1),
}
# The "xgb__" prefix routes each parameter to the pipeline step named "xgb".
param_dist = {f"xgb__{name}": dist for name, dist in xgb_search_space.items()}

search_options = dict(
    param_distributions=param_dist,
    n_iter=30,
    scoring="f1",
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=SEED,
)
random_search = RandomizedSearchCV(xg_pipeline, **search_options)
random_search.fit(x_train, y_train)

best_model = random_search.best_estimator_
xgb_preds = best_model.predict(x_test)

# --- Report on the held-out split -------------------------------------------
for preds, title in (
    (lr_preds, "Logistic Regression"),
    (xgb_preds, "XGBoost"),
):
    eval_model(y_test, preds, title)

# --- Persist the fitted pipelines -------------------------------------------
for fitted_pipeline, out_path in (
    (best_model, "xgb_pipeline.joblib"),
    (linear_pipeline, "linear_pipeline.joblib"),
):
    joblib.dump(fitted_pipeline, out_path)