adding Accuracy and RMSE on training for regression model

ShubhamSupekar · Oct 19, 2024 · 6634e30 · 6634e30
1 parent bd5ed12
commit 6634e30
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 45 deletions.
diff --git a/README b/README
@@ -6,4 +6,10 @@ run this command to activate virtual environment
 
 
 uvicorn main:app --reload
-to run webapp run above command 
+to run webapp run above command 
+
+
+
+The accuracy of a regression model is calculated with this formula:
+
+Accuracy(%) = 100 x [1-(RMSE/Mean of Actual Values)]
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
diff --git a/main.py b/main.py
@@ -98,6 +98,7 @@ async def remove_columns(request: Request, columns_to_remove: List[str] = Form(.
     })
 
 
+# FastAPI endpoint for training
 @app.post("/train")
 async def train_model(request: Request, predictor_column: str = Form(...), dataset: str = Form(...)):
     global df  # Access the global df variable
@@ -119,31 +120,35 @@ async def train_model(request: Request, predictor_column: str = Form(...), datas
     # Train the model using the selected predictor column
     results = lr.StartTraining(predictor_column, df)
 
-    # Prepare the output for each model in a list format
+    # Prepare the output for each model in a list format, including accuracy percentage
     model_results = [
         {
-            "model": result[0],
-            "features": result[1],
-            "r2_score": result[2]
+            "model": result[0],           # Model name
+            "features": result[1],        # Selected features
+            "r2_score": result[2],        # R² score
+            "rmse": result[3],            # RMSE value
+            "accuracy": result[4]         # Accuracy percentage
         } for result in results
     ]
 
-    # Sort model results by R² score in decreasing order
-    model_results.sort(key=lambda x: x['r2_score'], reverse=True)
+    # Sort model results by both R² score (decreasing) and RMSE (increasing)
+    model_results.sort(key=lambda x: (-x['r2_score'], x['rmse']))
 
-    # Find the best model based on R² score (first in sorted list)
+    # Pick the best model: highest R² score, ties broken by lowest RMSE (first in sorted list)
     best_model_result = model_results[0] if model_results else None
-    
+
     # Prepare a list of models excluding the best model for display
     other_models = model_results[1:] if model_results else []
 
-    # Return the template with the best model and all models
+    # Return the template with the best model, all models, and added accuracy percentage
     return templates.TemplateResponse("index.html", {
         "request": request,
         "predict_column": predictor_column,
         "best_model": best_model_result['model'] if best_model_result else None,  # Best model name
         "best_features": best_model_result['features'] if best_model_result else None,  # Best features
         "best_r2": best_model_result['r2_score'] if best_model_result else None,  # Best R² score
+        "best_rmse": best_model_result['rmse'] if best_model_result else None,  # Best RMSE
+        "best_accuracy": best_model_result['accuracy'] if best_model_result else None,  # Best Accuracy percentage
         "other_models": other_models,  # All other models
         "dataset_used": dataset,  # Show which dataset was used in training
         "selected_dataset": dataset
@@ -152,9 +157,6 @@ async def train_model(request: Request, predictor_column: str = Form(...), datas
 
 
 
-
-
-
 # Model to return results
 class LinearRegressionResult(BaseModel):
     slope: float

diff --git a/model/Regression.py b/model/Regression.py
@@ -1,25 +1,22 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
-from sklearn.preprocessing import PolynomialFeatures
 from sklearn.svm import SVR
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.feature_selection import mutual_info_regression
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
-from sklearn.metrics import r2_score
+from sklearn.metrics import r2_score, root_mean_squared_error
 from sklearn.feature_selection import RFE
 from joblib import Parallel, delayed
-
+import numpy as np
 
 
 # Load dataset function
 def LoadDataset(df):
     print("Available columns in the dataset:", df.columns)
     return df.columns
 
-
-
-# Start training with optimized models and RFE for feature selection
+# Start training with optimized models and prediction accuracy
 def StartTraining(target_column, df):
     # Drop rows with missing values
     df = df.dropna()
@@ -32,18 +29,13 @@ def StartTraining(target_column, df):
 
     print(f"\nInitial Features: {filtered_columns}")
 
-    # Perform RFE with multiple models
+    # Run the candidate models and evaluate each on the 20% held-out test split
     results = select_features_with_Mutual_Information(filtered_columns, target_column, df)
 
     return results  # Return the results list
 
-
-
-
 def select_features_with_Mutual_Information(features_list, target, df):
-    results = []  # Store results in a list
-
-    # Available models, excluding PyTorch Neural Network
+    # Available models
     models = {
         'LinearRegression': LinearRegression(),
         'Ridge': Ridge(),
@@ -61,39 +53,40 @@ def select_features_with_Mutual_Information(features_list, target, df):
 
     return results  # Return the list of results
 
-
-
-# Helper function to perform feature selection and evaluate each model
+# Helper function to perform feature selection, train, and evaluate each model
 def evaluate_model_with_feature_importance(model_name, model, df, features, target):
     X = df[features]
     y = df[target]
 
-    # Split the dataset
+    # Split the dataset (80% train, 20% held out for evaluation)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-    # Train the model for scikit-learn models
+    # Train the model
     model.fit(X_train, y_train)
-    y_pred = model.predict(X_test)
+    y_pred_test = model.predict(X_test)
 
-    # Calculate the R² score
-    r2 = r2_score(y_test, y_pred)
+    # Calculate the R² score on the test set
+    r2 = r2_score(y_test, y_pred_test)
 
+    # Tree-based models use feature importances; all other models use Mutual Information
     if model_name in ['RandomForest', 'GradientBoosting']:
-        # Feature importance for tree-based models
         importances = model.feature_importances_
-        selected_features = [features[i] for i in range(len(features)) if importances[i] > 0]  # Use all important features
-
-        # Print feature importance
+        selected_features = [features[i] for i in range(len(features)) if importances[i] > 0]
         print(f"Model: {model_name}, Feature Importances: {importances}")
     else:
-        # Use Mutual Information for non-tree-based models
         mi = mutual_info_regression(X_train, y_train)
-        selected_features = [features[i] for i in range(len(features)) if mi[i] > 0]  # Use features with positive MI
-
-        # Print Mutual Information scores
+        selected_features = [features[i] for i in range(len(features)) if mi[i] > 0]
         print(f"Model: {model_name}, Mutual Information Scores: {mi}")
 
-    # Print the model, selected features, and R² score
-    print(f"Model: {model_name}, Selected Features: {selected_features}, R² score: {r2:.4f}")
+    # Evaluate prediction error on the test set with RMSE (Root Mean Squared Error).
+    # root_mean_squared_error already returns the RMSE; an extra np.sqrt would report sqrt(RMSE).
+    rmse = root_mean_squared_error(y_test, y_pred_test)
+
+    # Calculate the accuracy percentage: 100 * (1 - RMSE / mean of actual values)
+    mean_actual = np.mean(y_test)
+    accuracy = 100 * (1 - (rmse / mean_actual))
+
+    # Print model, selected features, R² score, RMSE, and accuracy
+    print(f"Model: {model_name}, Selected Features: {selected_features}, R² score: {r2:.4f}, RMSE: {rmse:.4f}, Accuracy: {accuracy:.2f}%")
 
-    return model_name, selected_features, r2  # Return results
+    return model_name, selected_features, r2, rmse, accuracy  # Return model name, features, R² score, RMSE, and accuracy
diff --git a/model/__pycache__/Regression.cpython-312.pyc b/model/__pycache__/Regression.cpython-312.pyc
diff --git a/templates/index.html b/templates/index.html
@@ -111,6 +111,8 @@ <h4>Best Features:</h4>
                     {% endfor %}
                 </ul>
                 <p><strong>Best R² Value:</strong> {{ best_r2 }}</p>
+                <p><strong>Best Root Mean Squared Error:</strong> {{ best_rmse }}</p>
+                <p><strong>Best Accuracy:</strong> {{ best_accuracy }} %</p>
             </div>
         {% endif %}
 
@@ -129,6 +131,8 @@ <h5>Features:</h5>
                                 {% endfor %}
                             </ul>
                             <p><strong>R² Value:</strong> {{ result.r2_score }}</p>
+                            <p><strong> Root Mean Squared Error:</strong> {{ result.rmse }}</p>
+                            <p><strong> Accuracy:</strong> {{ result.accuracy }} %</p>
                         </li>
                     {% endfor %}
                 </ul>