Skip to content

Commit

Permalink
[ID-30] applied mlflow tracking to the model_training class (#32)
Browse files Browse the repository at this point in the history
Bug fix for issues raised by SonarQube
  • Loading branch information
SiddharthaShandilya authored Jul 13, 2023
1 parent f15ad2e commit 893b581
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 42 deletions.
86 changes: 46 additions & 40 deletions src/model_training/model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn.metrics import accuracy_score, roc_auc_score
from params import XGBOOST_HYPER_PARAMS, SVM_HYPER_PARAMS
import logging

import mlflow

class ModelTraining:
"""
Expand All @@ -33,12 +33,12 @@ def best_params_for_xgboost(
estimator=xgbr,
param_distributions=XGBOOST_HYPER_PARAMS,
scoring="neg_mean_squared_error",
n_iter=200,
n_iter=50,
verbose=1,
)
best_xgb_model.fit(fetaure_columns, target_columns)
logging.info(
f"Hyper Parameter tuning for XGBoost is called, best params found is {best_xgb_model.best_params_} \n with best score of {best_xgb_model .best_score_}"
f"Hyper Parameter tuning for XGBoost is called, best params found is {best_xgb_model.best_params_} \n with best score of {best_xgb_model.best_score_}"
)
return best_xgb_model

Expand All @@ -63,7 +63,7 @@ def best_params_for_svm(
estimator=xgbr,
param_distributions=SVM_HYPER_PARAMS,
scoring="neg_mean_squared_error",
n_iter=150,
n_iter=50,
verbose=1,
)
best_svm_model.fit(fetaure_columns, target_columns)
Expand All @@ -87,44 +87,50 @@ def calculate_best_model(
"""
logging.info("calculate_best_model function is called")
try:
xgboost = self.best_params_for_xgboost(feature_train, target_train)
prediction_xgboost = xgboost.best_estimator_.predict(
feature_test
) # Predictions using the XGBoost Model

if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
xgboost_score = accuracy_score(target_test, prediction_xgboost)
logging.info("Accuracy for XGBoost:" + str(xgboost_score)) # Log AUC
else:
xgboost_score = roc_auc_score(
target_test, prediction_xgboost
) # AUC for XGBoost
logging.info("AUC for XGBoost:" + str(xgboost_score)) # Log AUC

# create best model for Random Forest
svm = self.best_params_for_svm(feature_train, target_train)
prediction_svm = svm.best_estimator_.predict(
feature_test
) # prediction using the SVM Algorithm
with mlflow.start_run():
xgboost = self.best_params_for_xgboost(feature_train, target_train)
mlflow.log_params(xgboost.best_params_)
mlflow.log_metric("Best Score for xg_boost on trained data",xgboost.best_score_)
prediction_xgboost = xgboost.best_estimator_.predict(
feature_test
) # Predictions using the XGBoost Model

if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
svm_score = accuracy_score(target_test, prediction_svm)
logging.info("Accuracy for SVM:" + str(svm_score))
else:
svm_score = roc_auc_score(
target_test, prediction_svm
) # AUC for Random Forest
logging.info("AUC for SVM:" + str(svm_score))
if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
xgboost_score = accuracy_score(target_test, prediction_xgboost)
logging.info("Accuracy for XGBoost:" + str(xgboost_score)) # Log AUC
else:
xgboost_score = roc_auc_score(
target_test, prediction_xgboost
) # AUC for XGBoost
logging.info("AUC for XGBoost:" + str(xgboost_score)) # Log AUC
# storign the new score in mlflow
mlflow.log_metric("AUC Score for XG_Boost", xgboost_score)
########### create best model for Random Forest ###########
svm = self.best_params_for_svm(feature_train, target_train)
mlflow.log_params(xgboost.best_params_)
mlflow.log_metric("Best Score for SVM on trained data",xgboost.best_score_)
prediction_svm = svm.best_estimator_.predict(
feature_test
) # prediction using the SVM Algorithm

# comparing the two models
if svm_score < xgboost_score:
return "XGBoost", xgboost.best_estimator_
else:
return "SVM", svm.best_estimator_
if (
len(target_test.unique()) == 1
): # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
svm_score = accuracy_score(target_test, prediction_svm)
logging.info("Accuracy for SVM:" + str(svm_score))
else:
svm_score = roc_auc_score(
target_test, prediction_svm
) # AUC for Random Forest
logging.info("AUC for SVM:" + str(svm_score))
mlflow.log_metric("AUC Score for SVM", svm_score)
# comparing the two models
if svm_score < xgboost_score:
return "XGBoost", xgboost.best_estimator_
else:
return "SVM", svm.best_estimator_

except Exception as e:
logging.info(
Expand Down
4 changes: 2 additions & 2 deletions src/stage_04_model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def main():
f" total number of cluster present {combined_clustered_data[CLUSTER_COLUMN_NAME].unique()}"
)
logging.info(
f"Picking all the columns of cluster {values}\n Total data points {combined_clustered_data.shape}"
f"Total data points {combined_clustered_data.shape}"
)
for values in combined_clustered_data[CLUSTER_COLUMN_NAME].unique():
logging.info(f"Model training started for cluster {values}")
Expand Down Expand Up @@ -80,7 +80,7 @@ def main():
)
joblib.dump(best_model, trained_model_filename)
logging.info(
f"Model {best_model_name} succesfully trained for cluster {values} and stroed at {trained_model_filename}"
f"Model {best_model_name} succesfully trained for cluster {values} and stored at {trained_model_filename}"
)


Expand Down

0 comments on commit 893b581

Please sign in to comment.