diff --git a/README b/README
index 2fb9c08..8122e61 100644
--- a/README
+++ b/README
@@ -6,4 +6,10 @@ run this command to activate virtual environment
 
 uvicorn main:app --reload
 
-to run webapp run above command
\ No newline at end of file
+to run the web app, run the above command
+
+
+
+To calculate the accuracy of a regression model, use this formula:
+
+Accuracy(%) = 100 x [1 - (RMSE / Mean of Actual Values)]
\ No newline at end of file
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
index 77053c2..2ae9b46 100644
Binary files a/__pycache__/main.cpython-312.pyc and b/__pycache__/main.cpython-312.pyc differ
diff --git a/main.py b/main.py
index ae56a0f..e909c1f 100644
--- a/main.py
+++ b/main.py
@@ -98,6 +98,7 @@ async def remove_columns(request: Request, columns_to_remove: List[str] = Form(.
     })
 
 
+# FastAPI endpoint for training
 @app.post("/train")
 async def train_model(request: Request, predictor_column: str = Form(...), dataset: str = Form(...)):
     global df  # Access the global df variable
@@ -119,31 +120,35 @@ async def train_model(request: Request, predictor_column: str = Form(...), datas
     # Train the model using the selected predictor column
     results = lr.StartTraining(predictor_column, df)
 
-    # Prepare the output for each model in a list format
+    # Prepare the output for each model in a list format, including RMSE and accuracy
     model_results = [
         {
-            "model": result[0],
-            "features": result[1],
-            "r2_score": result[2]
+            "model": result[0],     # Model name
+            "features": result[1],  # Selected features
+            "r2_score": result[2],  # R² score
+            "rmse": result[3],      # RMSE value
+            "accuracy": result[4]   # Accuracy percentage
         }
         for result in results
     ]
 
-    # Sort model results by R² score in decreasing order
-    model_results.sort(key=lambda x: x['r2_score'], reverse=True)
+    # Sort model results by R² score (descending), breaking ties by RMSE (ascending)
+    model_results.sort(key=lambda x: (-x['r2_score'], x['rmse']))
 
-    # Find the best model based on R² score (first in sorted list)
+    # The best model is the first entry in the sorted list
     best_model_result = model_results[0] if model_results else None
-
+
     # Prepare a list of models excluding the best model for display
     other_models = model_results[1:] if model_results else []
 
-    # Return the template with the best model and all models
+    # Return the template with the best model, all models, and the new RMSE/accuracy fields
     return templates.TemplateResponse("index.html", {
         "request": request,
         "predict_column": predictor_column,
         "best_model": best_model_result['model'] if best_model_result else None,  # Best model name
         "best_features": best_model_result['features'] if best_model_result else None,  # Best features
         "best_r2": best_model_result['r2_score'] if best_model_result else None,  # Best R² score
+        "best_rmse": best_model_result['rmse'] if best_model_result else None,  # Best RMSE
+        "best_accuracy": best_model_result['accuracy'] if best_model_result else None,  # Best accuracy percentage
         "other_models": other_models,  # All other models
         "dataset_used": dataset,  # Show which dataset was used in training
         "selected_dataset": dataset
@@ -152,9 +157,6 @@ async def train_model(request: Request, predictor_column: str = Form(...), datas
 
 
 
-
-
-
 # Model to return results
 class LinearRegressionResult(BaseModel):
     slope: float
diff --git a/model/Regression.py b/model/Regression.py
index f24156a..77b1029 100644
--- a/model/Regression.py
+++ b/model/Regression.py
@@ -1,15 +1,14 @@
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
-from sklearn.preprocessing import PolynomialFeatures
 from sklearn.svm import SVR
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.feature_selection import mutual_info_regression
 from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
-from sklearn.metrics import r2_score
+from sklearn.metrics import r2_score, root_mean_squared_error
 from sklearn.feature_selection import RFE
 from joblib import Parallel, delayed
-
+import numpy as np
 
 
 # Load dataset function
@@ -17,9 +16,7 @@ def LoadDataset(df):
     print("Available columns in the dataset:", df.columns)
     return df.columns
 
-
-
-# Start training with optimized models and RFE for feature selection
+# Start training with optimized models and prediction accuracy
 def StartTraining(target_column, df):
     # Drop rows with missing values
     df = df.dropna()
@@ -32,18 +29,13 @@ def StartTraining(target_column, df):
 
     print(f"\nInitial Features: {filtered_columns}")
 
-    # Perform RFE with multiple models
+    # Train multiple models and evaluate each on a 20% hold-out test set
    results = select_features_with_Mutual_Information(filtered_columns, target_column, df)
 
     return results  # Return the results list
 
-
-
-
 def select_features_with_Mutual_Information(features_list, target, df):
-    results = []  # Store results in a list
-
-    # Available models, excluding PyTorch Neural Network
+    # Available models
     models = {
         'LinearRegression': LinearRegression(),
         'Ridge': Ridge(),
@@ -61,39 +53,39 @@ def select_features_with_Mutual_Information(features_list, target, df):
 
     return results  # Return the list of results
 
-
-
-# Helper function to perform feature selection and evaluate each model
+# Helper function to perform feature selection, train, and evaluate each model
 def evaluate_model_with_feature_importance(model_name, model, df, features, target):
     X = df[features]
     y = df[target]
 
-    # Split the dataset
+    # Split the dataset (80% train, 20% test)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-    # Train the model for scikit-learn models
+    # Train the model
     model.fit(X_train, y_train)
-    y_pred = model.predict(X_test)
+    y_pred_test = model.predict(X_test)
 
-    # Calculate the R² score
-    r2 = r2_score(y_test, y_pred)
+    # Calculate the R² score on the test set
+    r2 = r2_score(y_test, y_pred_test)
 
+    # Feature importances for tree-based models, Mutual Information otherwise
     if model_name in ['RandomForest', 'GradientBoosting']:
-        # Feature importance for tree-based models
         importances = model.feature_importances_
-        selected_features = [features[i] for i in range(len(features)) if importances[i] > 0]  # Use all important features
-
-        # Print feature importance
+        selected_features = [features[i] for i in range(len(features)) if importances[i] > 0]
         print(f"Model: {model_name}, Feature Importances: {importances}")
     else:
-        # Use Mutual Information for non-tree-based models
         mi = mutual_info_regression(X_train, y_train)
-        selected_features = [features[i] for i in range(len(features)) if mi[i] > 0]  # Use features with positive MI
-
-        # Print Mutual Information scores
+        selected_features = [features[i] for i in range(len(features)) if mi[i] > 0]
         print(f"Model: {model_name}, Mutual Information Scores: {mi}")
 
-    # Print the model, selected features, and R² score
-    print(f"Model: {model_name}, Selected Features: {selected_features}, R² score: {r2:.4f}")
+    # Evaluate prediction error with RMSE (root_mean_squared_error already returns the root)
+    rmse = root_mean_squared_error(y_test, y_pred_test)
+
+    # Calculate the accuracy percentage: 100 x [1 - (RMSE / mean of actual values)]
+    mean_actual = np.mean(y_test)
+    accuracy = 100 * (1 - (rmse / mean_actual))
+
+    # Print model, selected features, R² score, RMSE, and accuracy
+    print(f"Model: {model_name}, Selected Features: {selected_features}, R² score: {r2:.4f}, RMSE: {rmse:.4f}, Accuracy: {accuracy:.2f}%")
 
-    return model_name, selected_features, r2  # Return results
\ No newline at end of file
+    return model_name, selected_features, r2, rmse, accuracy  # Return model name, features, R², RMSE, accuracy
diff --git a/model/__pycache__/Regression.cpython-312.pyc b/model/__pycache__/Regression.cpython-312.pyc
index e82222c..8551c6a 100644
Binary files a/model/__pycache__/Regression.cpython-312.pyc and b/model/__pycache__/Regression.cpython-312.pyc differ
diff --git a/templates/index.html b/templates/index.html
index c6ce4e4..202c242 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -111,6 +111,8 @@
 Best R² Value: {{ best_r2 }}
+Best RMSE: {{ best_rmse }}
+Best Accuracy: {{ best_accuracy }}%
 {% endif %}
@@ -129,6 +131,8 @@
 R² Value: {{ result.r2_score }}
+RMSE: {{ result.rmse }}
+Accuracy: {{ result.accuracy }}%
 {% endfor %}
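
Note on the accuracy metric: this patch applies the README formula to the held-out test set. Below is a minimal standalone sketch of the computation with synthetic data; the variable names and data are illustrative, and root_mean_squared_error assumes scikit-learn >= 1.4 (on older versions, mean_squared_error(..., squared=False) computes the same value):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score, root_mean_squared_error
    from sklearn.model_selection import train_test_split

    # Synthetic data with a clearly positive mean, since the formula divides by mean(y_test)
    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 3))
    y = 50.0 + X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.5, size=200)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression().fit(X_train, y_train)
    y_pred_test = model.predict(X_test)

    r2 = r2_score(y_test, y_pred_test)
    rmse = root_mean_squared_error(y_test, y_pred_test)  # already the root, no extra sqrt

    # Accuracy(%) = 100 x [1 - (RMSE / Mean of Actual Values)]
    accuracy = 100 * (1 - rmse / np.mean(y_test))
    print(f"R²: {r2:.4f}, RMSE: {rmse:.4f}, Accuracy: {accuracy:.2f}%")

The formula is only meaningful when the target's mean is positive and well away from zero; for targets centred near zero, the RMSE/mean ratio is unstable.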
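
Note on the ranking change in main.py: sorting with key=lambda x: (-x['r2_score'], x['rmse']) orders models by R² descending and breaks ties with RMSE ascending in a single pass. A small sketch of the idiom, with made-up results:

    model_results = [
        {"model": "Ridge", "r2_score": 0.91, "rmse": 3.2},
        {"model": "Lasso", "r2_score": 0.91, "rmse": 2.9},
        {"model": "SVR",   "r2_score": 0.87, "rmse": 4.1},
    ]

    # Negating r2_score sorts it in decreasing order; rmse stays increasing as the tie-breaker
    model_results.sort(key=lambda x: (-x["r2_score"], x["rmse"]))
    print([r["model"] for r in model_results])  # ['Lasso', 'Ridge', 'SVR']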
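
For completeness, a sketch of exercising the /train endpoint outside the browser. The column and dataset values are hypothetical, and it assumes the app is running locally via "uvicorn main:app --reload" and that a dataset has already been uploaded so the global df is populated:

    import requests

    # FastAPI Form(...) parameters expect form-encoded data, which requests sends via data=
    resp = requests.post(
        "http://127.0.0.1:8000/train",
        data={"predictor_column": "price", "dataset": "housing.csv"},  # illustrative values
    )
    print(resp.status_code)  # the endpoint renders index.html with the ranked models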