training the model

fatou1526 · Sep 19, 2023 · 52bed6c · 52bed6c
1 parent 1f24dbe
commit 52bed6c
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 4 deletions.
diff --git a/.github/workflows/main_workflow.yml b/.github/workflows/main_workflow.yml
@@ -36,3 +36,5 @@ jobs:
       - name: Run Python module
         run: python src/process.py
 
+      - name: Train and evaluate model
+        run: python src/train.py
diff --git a/src/train.py b/src/train.py
@@ -1,4 +1,31 @@
- """Contient entrainement du modele."""
+import pandas as pd
+from utils import load_data
+from utils import split_dataset
+from utils import encoding
+from utils import normalize
+from utils import training
+from utils import evaluate_model
 
-def training():
-    pass
+# Loading cleaned data
+data = load_data("C:/Users/USER/Documents/Master2 DIT/Outil versioning/branche1/arborescence_tree/data/cleaned/cleaned_data.csv")
+
+# Splitting data
+y = data['label']
+X = data.drop('label', axis=1)
+X_train, X_test, y_train, y_test = split_dataset(X, y)
+
+# y Label encoding
+y_train = encoding(y_train)
+y_test = encoding(y_test)
+
+# Normalize X features
+X_train = normalize(X_train)
+X_test = normalize(X_test)
+
+# Training
+y_pred = training(X_train, y_train, X_test)
+
+# Evaluate model
+accuracy, report = evaluate_model(y_test, y_pred)
+print(f"The accuracy is {accuracy}")
+print(f"The report is \n {report}")
diff --git a/src/utils.py b/src/utils.py
@@ -4,6 +4,11 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report
 
 def load_data(filepath):
     # loading dataset
@@ -41,7 +46,33 @@ def preprocessing_outliers(data_df):
     return data_df
 
 
+def split_dataset(X, y):
+    # This method helps to split the data to train and test datasets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
+    return X_train, X_test, y_train, y_test 
 
-
+# label encoder
+def encoding(label):
+    le= LabelEncoder()
+    label =le.fit_transform(label)
+    return label
+
+# Normalization/Standardisation
+def normalize(features):
+    features = StandardScaler().fit_transform(features)
+    return features
+
+# Training
+def training(X_train, y_train, X_test):    
+    rfc = RandomForestClassifier(n_estimators=100, random_state=24)
+    rfc.fit(X_train, y_train)
+    y_pred = rfc.predict(X_test)
+    return y_pred
+
+# Evaluate model
+def evaluate_model(y_test, y_pred):
+    accuracy = accuracy_score(y_test, y_pred)
+    report = classification_report(y_test, y_pred)
+    return accuracy, report