From 64e08927f453e6b9fafb8531f27b0d588046adf3 Mon Sep 17 00:00:00 2001 From: midaa1 Date: Thu, 5 Feb 2026 16:32:17 +0200 Subject: [PATCH 1/2] gaze_tracker refactor to prevent data leakage, spliting problems and metrics calculations --- app/services/gaze_tracker.py | 168 ++++++++++++++++------------------- app/services/metrics.py | 15 ++++ 2 files changed, 91 insertions(+), 92 deletions(-) diff --git a/app/services/gaze_tracker.py b/app/services/gaze_tracker.py index 7d1f7ce..3354551 100644 --- a/app/services/gaze_tracker.py +++ b/app/services/gaze_tracker.py @@ -14,13 +14,15 @@ from sklearn.pipeline import make_pipeline from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import Ridge -from sklearn.pipeline import make_pipeline + # Model imports from sklearn import linear_model from sklearn.svm import SVR from sklearn.cluster import KMeans from sklearn.model_selection import GridSearchCV +from sklearn.model_selection import GroupShuffleSplit +import matplotlib.pyplot as plt # Metrics imports from sklearn.metrics import make_scorer @@ -37,6 +39,7 @@ func_presicion_y, func_accuracy_x, func_accuracy_y, + func_total_accuracy, ) from app.services.config import hyperparameters @@ -78,6 +81,37 @@ def squash(v, limit=1.0): """Squash não-linear estilo WebGazer""" return np.tanh(v / limit) +def trian_and_predict(model_name, X_train, y_train, X_test, y_test, label): + """ + Helper to train a model (with or without GridSearchCV) and return predictions. + """ + if ( + model_name == "Linear Regression" + or model_name == "Elastic Net" + or model_name == "Support Vector Regressor" + ): + model = models[model_name] + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + print(f"Score {label}: {r2_score(y_test, y_pred)}") + return y_pred + else: + pipeline = models[model_name] + param_grid = hyperparameters[model_name]["param_grid"] + grid_search = GridSearchCV( + pipeline, + param_grid, + cv=5, + scoring=scoring, + refit="r2", + return_train_score=True, + ) + grid_search.fit(X_train, y_train) + best_model = grid_search.best_estimator_ + y_pred = best_model.predict(X_test) + return y_pred + + def predict(data, k, model_X, model_Y): """ Predicts the gaze coordinates using machine learning models. @@ -91,97 +125,52 @@ def predict(data, k, model_X, model_Y): Returns: dict: A dictionary containing the predicted gaze coordinates, precision, accuracy, and cluster centroids. """ - # Inicialize standard scaler - sc = StandardScaler() + # Load data from csv file and drop unnecessary columns df = pd.read_csv(data) df = df.drop(["screen_height", "screen_width"], axis=1) + print(df.head()) + # Create groups (point_x, point_y) + df["group"] = list(zip(df["point_x"], df["point_y"])) # Data for X axis X_x = df[["left_iris_x", "right_iris_x"]] X_y = df["point_x"] - - # Normalize data using standard scaler and split data into training and testing sets - X_x = sc.fit_transform(X_x) - X_train_x, X_test_x, y_train_x, y_test_x = train_test_split( - X_x, X_y, test_size=0.2, random_state=42 - ) - - if ( - model_X == "Linear Regression" - or model_X == "Elastic Net" - or model_X == "Support Vector Regressor" - ): - model = models[model_X] - - # Fit the model and make predictions - model.fit(X_train_x, y_train_x) - y_pred_x = model.predict(X_test_x) - - else: - pipeline = models[model_X] - param_grid = hyperparameters[model_X]["param_grid"] - - # Initialize GridSearchCV with the pipeline and parameter grid - grid_search = GridSearchCV( - pipeline, - param_grid, - cv=5, - scoring=scoring, - refit="r2", - return_train_score=True, - ) - - # Fit the GridSearchCV to the training data for X - grid_search.fit(X_train_x, y_train_x) - - # Use the best estimator to predict the values and calculate the R2 score - best_model_x = grid_search.best_estimator_ - y_pred_x = best_model_x.predict(X_test_x) - + # groups = df["group"] # Data for Y axis - X_y = df[["left_iris_y", "right_iris_y"]] + X_feature_y = df[["left_iris_y", "right_iris_y"]] y_y = df["point_y"] - - # Normalize data using standard scaler and split data into training and testing sets - X_y = sc.fit_transform(X_y) - X_train_y, X_test_y, y_train_y, y_test_y = train_test_split( - X_y, y_y, test_size=0.2, random_state=42 + # Split data into training and testing sets then Normalize data using standard scaler + ( + X_train_x, X_test_x, + y_train_x, y_test_x, + X_train_y, X_test_y, + y_train_y, y_test_y + )= train_test_split( + X_x, + X_y, + X_feature_y, + y_y, + test_size=0.2, + random_state=42, ) + + # Scaling (fit on train only) + scaler_x = StandardScaler() + X_train_x = scaler_x.fit_transform(X_train_x) + X_test_x = scaler_x.transform(X_test_x) + + y_pred_x = trian_and_predict(model_X, X_train_x, y_train_x, X_test_x, y_test_x, "X") + + # Scaling (fit on train only) + scaler_y = StandardScaler() + X_train_y = scaler_y.fit_transform(X_train_y) + X_test_y = scaler_y.transform(X_test_y) - if ( - model_Y == "Linear Regression" - or model_Y == "Elastic Net" - or model_Y == "Support Vector Regressor" - ): - model = models[model_Y] - - # Fit the model and make predictions - model.fit(X_train_y, y_train_y) - y_pred_y = model.predict(X_test_y) - - else: - pipeline = models[model_Y] - param_grid = hyperparameters[model_Y]["param_grid"] - - # Initialize GridSearchCV with the pipeline and parameter grid - grid_search = GridSearchCV( - pipeline, - param_grid, - cv=5, - scoring=scoring, - refit="r2", - return_train_score=True, - ) - - # Fit the GridSearchCV to the training data for X - grid_search.fit(X_train_y, y_train_y) - - # Use the best estimator to predict the values and calculate the R2 score - best_model_y = grid_search.best_estimator_ - y_pred_y = best_model_y.predict(X_test_y) - + + y_pred_y = trian_and_predict(model_Y, X_train_y, y_train_y, X_test_y, y_test_y, "Y") + # Convert the predictions to a numpy array and apply KMeans clustering data = np.array([y_pred_x, y_pred_y]).T model = KMeans(n_clusters=k, n_init="auto", init="k-means++") @@ -196,25 +185,20 @@ def predict(data, k, model_X, model_Y): } df_data = pd.DataFrame(data) df_data["True XY"] = list(zip(df_data["True X"], df_data["True Y"])) - + # Filter out negative values df_data = df_data[(df_data["Predicted X"] >= 0) & (df_data["Predicted Y"] >= 0)] - # Calculate the precision and accuracy for each + # Calculate the precision and accuracy for each precision_x = df_data.groupby("True XY").apply(func_precision_x) precision_y = df_data.groupby("True XY").apply(func_presicion_y) - # Calculate the average precision and accuracy + # Calculate the average precision precision_xy = (precision_x + precision_y) / 2 - precision_xy = precision_xy / np.mean(precision_xy) - - # Calculate the accuracy for each axis - accuracy_x = df_data.groupby("True XY").apply(func_accuracy_x) - accuracy_y = df_data.groupby("True XY").apply(func_accuracy_y) - - # Calculate the average accuracy - accuracy_xy = (accuracy_x + accuracy_y) / 2 - accuracy_xy = accuracy_xy / np.mean(accuracy_xy) + + # Calculate the average accuracy (eculidian distance) + accuracy_xy = df_data.groupby("True XY").apply(func_total_accuracy) + # Create a dictionary to store the data data = {} diff --git a/app/services/metrics.py b/app/services/metrics.py index af0fd77..5d89d9d 100644 --- a/app/services/metrics.py +++ b/app/services/metrics.py @@ -57,3 +57,18 @@ def func_accuracy_y(group): """ return np.sqrt(np.mean(np.square(group["True Y"] - group["Predicted Y"]))) +def func_total_accuracy(group): + """ + Calculate the total accuracy for the X and Y axes. + + Args: + group (pandas.DataFrame): A group of data containing the predicted and true values for the X and Y axes. + + Returns: + float: The total accuracy value. + """ + distances = np.sqrt( + np.square(group["True X"] - group["Predicted X"]) + + np.square(group["True Y"] - group["Predicted Y"]) + ) + return np.mean(distances) # Returns average error in pixels \ No newline at end of file From f4808d2cec517ad7803f87b260860fdb81dfae93 Mon Sep 17 00:00:00 2001 From: midaa1 Date: Thu, 5 Feb 2026 17:03:42 +0200 Subject: [PATCH 2/2] gaze_tracker refactor to prevent data leakage, spliting problems and metrics calculations --- app/services/metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/services/metrics.py b/app/services/metrics.py index 5d89d9d..0473ecb 100644 --- a/app/services/metrics.py +++ b/app/services/metrics.py @@ -65,7 +65,8 @@ def func_total_accuracy(group): group (pandas.DataFrame): A group of data containing the predicted and true values for the X and Y axes. Returns: - float: The total accuracy value. + float: The total accuracy value(eculidean distance). + """ distances = np.sqrt( np.square(group["True X"] - group["Predicted X"]) +