Skip to content

Commit

Permalink
training the model
Browse files Browse the repository at this point in the history
  • Loading branch information
fatou1526 committed Sep 19, 2023
1 parent 1f24dbe commit 52bed6c
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/main_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ jobs:
- name: Run Python module
run: python src/process.py

- name: Run Python module
run: python src/train.py
33 changes: 30 additions & 3 deletions src/train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,31 @@
"""Contient entrainement du modele."""
from pathlib import Path

import pandas as pd

from utils import (
    encoding,
    evaluate_model,
    load_data,
    normalize,
    split_dataset,
    training,
)

# Cleaned dataset location, resolved relative to this file instead of an
# absolute path on one developer's machine, so the script also runs in CI.
# NOTE(review): assumes the layout <repo>/src/train.py and
# <repo>/data/cleaned/cleaned_data.csv -- confirm against the repository.
CLEANED_DATA_PATH = (
    Path(__file__).resolve().parent.parent / "data" / "cleaned" / "cleaned_data.csv"
)


def main():
    """Run the training pipeline: load, split, scale, fit, evaluate, report.

    The accuracy and the classification report are printed to stdout.
    No local function named ``training`` is defined in this module, because
    it would shadow the ``training`` helper imported from ``utils``.
    """
    # Loading cleaned data
    data = load_data(str(CLEANED_DATA_PATH))

    # Encode the labels once, before splitting, so the train and test targets
    # share a single label-to-integer mapping even if a class is missing from
    # one of the splits.
    y = encoding(data["label"])
    X = data.drop("label", axis=1)

    # Splitting data
    X_train, X_test, y_train, y_test = split_dataset(X, y)

    # Normalize X features
    # NOTE(review): each call fits its own scaler, so the test split is scaled
    # with its own statistics rather than the training ones -- consider
    # fitting on X_train only and reusing that scaler for X_test.
    X_train = normalize(X_train)
    X_test = normalize(X_test)

    # Training
    y_pred = training(X_train, y_train, X_test)

    # Evaluate model
    accuracy, report = evaluate_model(y_test, y_pred)
    print(f"The accuracy is {accuracy}")
    print(f"The report is \n {report}")


if __name__ == "__main__":
    main()
33 changes: 32 additions & 1 deletion src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def load_data(filepath):
# loading dataset
Expand Down Expand Up @@ -41,7 +46,33 @@ def preprocessing_outliers(data_df):
return data_df


def split_dataset(X, y, *, test_size=0.2, random_state=42):
    """Split features and labels into train and test subsets.

    Args:
        X: Feature table, passed unchanged to ``train_test_split``.
        y: Labels aligned with ``X``.
        test_size: Share of the samples held out for testing. Defaults to
            0.2, the value that was previously hard-coded.
        random_state: Seed for the shuffle, so the split is reproducible.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # train_test_split returns a list; convert it so callers keep receiving
    # a tuple, as before.
    return tuple(
        train_test_split(X, y, test_size=test_size, random_state=random_state)
    )


# label encoder
def encoding(label):
    """Return ``label`` encoded as integer codes.

    A new ``LabelEncoder`` is created and fitted on every call, so the codes
    depend only on the values present in ``label`` for that call.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(label)

# Normalization/Standardisation
def normalize(features):
    """Return ``features`` standardised with a freshly fitted scaler.

    A new ``StandardScaler`` is fitted on ``features`` on every call, so the
    scaling statistics come from the data passed in and nothing else.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)
    return scaled

# Training
def training(X_train, y_train, X_test):
    """Fit a random forest on the training data and predict the test set.

    The classifier uses 100 estimators and a fixed ``random_state`` of 24.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict.

    Returns:
        The predictions of the fitted classifier for ``X_test``.
    """
    model = RandomForestClassifier(random_state=24, n_estimators=100)
    model.fit(X_train, y_train)
    return model.predict(X_test)

# Evaluate model
def evaluate_model(y_test, y_pred):
    """Score predictions against the true labels.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        A tuple ``(accuracy, report)`` holding the results of
        ``accuracy_score`` and ``classification_report`` respectively.
    """
    return accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)


0 comments on commit 52bed6c

Please sign in to comment.