Skip to content

Commit

Permalink
[ID-30] applied mlflow tracking to the model_training class (#32)
Browse files Browse the repository at this point in the history
Bug fix for issues raised by SonarQube
  • Loading branch information
SiddharthaShandilya authored Jul 13, 2023
1 parent f15ad2e commit 893b581
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 42 deletions.
86 changes: 46 additions & 40 deletions src/model_training/model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.metrics import accuracy_score, roc_auc_score
from params import XGBOOST_HYPER_PARAMS, SVM_HYPER_PARAMS
import logging

import mlflow

class ModelTraining:
"""
Expand All @@ -33,12 +33,12 @@ def best_params_for_xgboost(
estimator=xgbr,
param_distributions=XGBOOST_HYPER_PARAMS,
scoring="neg_mean_squared_error",
n_iter=200,
n_iter=50,
verbose=1,
)
best_xgb_model.fit(fetaure_columns, target_columns)
logging.info(
f"Hyper Parameter tuning for XGBoost is called, best params found is {best_xgb_model.best_params_} \n with best score of {best_xgb_model .best_score_}"
f"Hyper Parameter tuning for XGBoost is called, best params found is {best_xgb_model.best_params_} \n with best score of {best_xgb_model.best_score_}"
)
return best_xgb_model

Expand All @@ -63,7 +63,7 @@ def best_params_for_svm(
estimator=xgbr,
param_distributions=SVM_HYPER_PARAMS,
scoring="neg_mean_squared_error",
n_iter=150,
n_iter=50,
verbose=1,
)
best_svm_model.fit(fetaure_columns, target_columns)
Expand All @@ -87,44 +87,50 @@ def calculate_best_model(
"""
logging.info("calculate_best_model function is called")
try:
xgboost = self.best_params_for_xgboost(feature_train, target_train)
prediction_xgboost = xgboost.best_estimator_.predict(
feature_test
) # Predictions using the XGBoost Model

if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
xgboost_score = accuracy_score(target_test, prediction_xgboost)
logging.info("Accuracy for XGBoost:" + str(xgboost_score)) # Log AUC
else:
xgboost_score = roc_auc_score(
target_test, prediction_xgboost
) # AUC for XGBoost
logging.info("AUC for XGBoost:" + str(xgboost_score)) # Log AUC

# create best model for Random Forest
svm = self.best_params_for_svm(feature_train, target_train)
prediction_svm = svm.best_estimator_.predict(
feature_test
) # prediction using the SVM Algorithm
with mlflow.start_run():
xgboost = self.best_params_for_xgboost(feature_train, target_train)
mlflow.log_params(xgboost.best_params_)
mlflow.log_metric("Best Score for xg_boost on trained data",xgboost.best_score_)
prediction_xgboost = xgboost.best_estimator_.predict(
feature_test
) # Predictions using the XGBoost Model

if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
svm_score = accuracy_score(target_test, prediction_svm)
logging.info("Accuracy for SVM:" + str(svm_score))
else:
svm_score = roc_auc_score(
target_test, prediction_svm
) # AUC for Random Forest
logging.info("AUC for SVM:" + str(svm_score))
if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
xgboost_score = accuracy_score(target_test, prediction_xgboost)
logging.info("Accuracy for XGBoost:" + str(xgboost_score)) # Log AUC
else:
xgboost_score = roc_auc_score(
target_test, prediction_xgboost
) # AUC for XGBoost
logging.info("AUC for XGBoost:" + str(xgboost_score)) # Log AUC
# storign the new score in mlflow
mlflow.log_metric("AUC Score for XG_Boost", xgboost_score)
########### create best model for Random Forest ###########
svm = self.best_params_for_svm(feature_train, target_train)
mlflow.log_params(xgboost.best_params_)
mlflow.log_metric("Best Score for SVM on trained data",xgboost.best_score_)
prediction_svm = svm.best_estimator_.predict(
feature_test
) # prediction using the SVM Algorithm

# comparing the two models
if svm_score < xgboost_score:
return "XGBoost", xgboost.best_estimator_
else:
return "SVM", svm.best_estimator_
if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
svm_score = accuracy_score(target_test, prediction_svm)
logging.info("Accuracy for SVM:" + str(svm_score))
else:
svm_score = roc_auc_score(
target_test, prediction_svm
) # AUC for Random Forest
logging.info("AUC for SVM:" + str(svm_score))
mlflow.log_metric("AUC Score for SVM", svm_score)
# comparing the two models
if svm_score < xgboost_score:
return "XGBoost", xgboost.best_estimator_
else:
return "SVM", svm.best_estimator_

except Exception as e:
logging.info(
Expand Down
4 changes: 2 additions & 2 deletions src/stage_04_model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def main():
f" total number of cluster present {combined_clustered_data[CLUSTER_COLUMN_NAME].unique()}"
)
logging.info(
f"Picking all the columns of cluster {values}\n Total data points {combined_clustered_data.shape}"
f"Total data points {combined_clustered_data.shape}"
)
for values in combined_clustered_data[CLUSTER_COLUMN_NAME].unique():
logging.info(f"Model training started for cluster {values}")
Expand Down Expand Up @@ -80,7 +80,7 @@ def main():
)
joblib.dump(best_model, trained_model_filename)
logging.info(
f"Model {best_model_name} succesfully trained for cluster {values} and stroed at {trained_model_filename}"
f"Model {best_model_name} succesfully trained for cluster {values} and stored at {trained_model_filename}"
)


Expand Down

0 comments on commit 893b581

Please sign in to comment.