Skip to content

Commit

Permalink
Some config
Browse files Browse the repository at this point in the history
  • Loading branch information
infini11 committed Oct 6, 2022
1 parent 85de392 commit 3c6c8fd
Show file tree
Hide file tree
Showing 7 changed files with 26,601 additions and 25 deletions.
5 changes: 2 additions & 3 deletions dags/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

ML_DATASET_OUTPUT_FOLDER = "/opt/airflow/output"
AIRFLOW_PREFIX_TO_DATA = '/opt/airflow/data/'
+MLRUNS_DIR = '/mlruns'

TRAIN_DATA = os.path.join(AIRFLOW_PREFIX_TO_DATA, "train/df_ml_train.csv")
TEST_DATA = os.path.join(AIRFLOW_PREFIX_TO_DATA , "test/df_ml_test.csv")
Expand Down Expand Up @@ -57,6 +58,4 @@
'max_features': ['auto'],
'n_estimators': np.arange(10, 20, 2)}
}
-}
-
-MLRUNS_DIR = f'{os.getcwd()}/mlruns'
+}
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ x-airflow-common:
- ./output:/opt/airflow/output
- ./models:/opt/airflow/models
- ./src:/opt/airflow/src
- ./mlruns:/mlruns
- ./great_expectations:/opt/airflow/great_expectations
user: "${AIRFLOW_UID}:0"
depends_on:
Expand Down
9,282 changes: 9,282 additions & 0 deletions output/ml_test.csv

Large diffs are not rendered by default.

17,298 changes: 17,298 additions & 0 deletions output/ml_train.csv

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ click==8.0.1
ecg-qc==1.0b5
great-expectations==0.13.25
hrv-analysis==1.0.4
-mlflow==1.28
+mlflow==1.23.1
numpy==1.19.5
pandas==1.1.5
+seaborn==0.11.2
psycopg2-binary==2.8.6
py-ecg-detectors==1.0.2
pyEDFlib==0.1.22
Expand Down
Empty file.
37 changes: 16 additions & 21 deletions src/usecase/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
import mlflow
import matplotlib.pyplot as plt
import os
import sys
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score,\
Expand All @@ -36,13 +36,6 @@
from src.usecase.utilities import convert_args_to_dict

TRACKING_URI = 'http://mlflow:5000'
# MODEL_PARAM = {
# 'model': RandomForestClassifier(),
# 'grid_parameters': {
# 'min_samples_leaf': np.arange(1, 5, 1),
# 'max_depth': np.arange(1, 7, 1),
# 'max_features': ['auto'],
# 'n_estimators': np.arange(10, 20, 2)}}
MODEL_PARAM = {
'model': xgb.XGBClassifier(),
'grid_parameters': {
Expand Down Expand Up @@ -112,8 +105,8 @@ def compute_metrics(prefix: str,
print('cannot compute ROC_AUC_score')

try:
titles_options = [(f'{prefix} - Confusion Matrix', None),
(f'{prefix} - Normalized Confusion Matrix', 'true')]
titles_options = [(f'{prefix}-Confusion Matrix', None),
(f'{prefix}-Normalized Confusion Matrix', 'true')]
for title, normalize in titles_options:

if normalize is None:
Expand All @@ -127,12 +120,12 @@ def compute_metrics(prefix: str,
disp.ax_.set_title(title)
temp_name = f'{mlruns_dir}/{title}.png'
plt.savefig(temp_name)
mlflow.log_artifact(temp_name, "confusion-matrix-plots")
mlflow.log_artifact(temp_name)

if total_seconds is not None:
titles_options = [
(f'{prefix} - Confusion Matrix Minutes', None, 'minutes'),
(f'{prefix} - Confusion Matrix Seconds', None, 'seconds')]
(f'{prefix}-Confusion Matrix Minutes', None, 'minutes'),
(f'{prefix}-Confusion Matrix Seconds', None, 'seconds')]

for title, normalize, time_unit in titles_options:

Expand All @@ -149,7 +142,7 @@ def compute_metrics(prefix: str,
disp.ax_.set_title(title)
temp_name = f'{mlruns_dir}/{title}.png'
plt.savefig(temp_name)
mlflow.log_artifact(temp_name, "confusion-matrix-plots")
mlflow.log_artifact(temp_name)

except ValueError:
print('cannot generate confusion matrices')
Expand Down Expand Up @@ -253,6 +246,9 @@ def train_pipeline_with_io(ml_dataset_cleaned_path: str,
df_ml = pd.read_csv(ml_dataset_cleaned_path)
df_ml_test = pd.read_csv(ml_dataset_path_cleaned_test)

df_ml = clean_ml_dataset(df_ml)
df_ml_test = clean_ml_dataset(df_ml_test)

df_ml['patient_id'] = df_ml['filename'].apply(extract_patient_id)
df_ml_test['patient_id'] = df_ml_test['filename'].apply(extract_patient_id)

Expand Down Expand Up @@ -322,7 +318,7 @@ def train_model(

mlflow.set_tracking_uri(tracking_uri)
with mlflow.start_run():

print(mlflow.get_artifact_uri())

feature_names = []

Expand Down Expand Up @@ -378,11 +374,10 @@ def train_model(
y_test_pred = grid_search.predict(X_test)

# Model and performance logging
-mlflow.sklearn.log_model(grid_search, 'model')
+mlflow.sklearn.log_model(grid_search, 'xgboost')

mlflow.log_param('best_param', grid_search.best_params_)
-mlflow.log_param("ID-Patient", 18)
-mlflow.log_param("Description", "RandomForest model pour patient {18}")
+mlflow.log_param("Description", "Xgboost model sur les patients {22, 34, 39, 45}")
# mlflow.log_param('algorith', 'rfc')

compute_metrics('train',
Expand All @@ -398,8 +393,8 @@ def train_model(
mlruns_dir=mlruns_dir)

# log features importances
-plot_feature_importance(grid_search.best_estimator_.feature_importances_,
-feature_names, "RandomForest ", mlruns_dir)
+# plot_feature_importance(grid_search.best_estimator_.feature_importances_,
+# feature_names, "RandomForest ", mlruns_dir)



Expand Down

0 comments on commit 3c6c8fd

Please sign in to comment.