From 28164b1aa50560bd32a6d068269b52f8ba962575 Mon Sep 17 00:00:00 2001
From: Sarthak Basu
Date: Sun, 8 Feb 2026 02:54:24 +0530
Subject: [PATCH 1/2] Fix: Stratified splitting for calibration data

---
 app/services/gaze_tracker.py | 60 ++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/app/services/gaze_tracker.py b/app/services/gaze_tracker.py
index 7d1f7ce..723bcef 100644
--- a/app/services/gaze_tracker.py
+++ b/app/services/gaze_tracker.py
@@ -92,21 +92,33 @@ def predict(data, k, model_X, model_Y):
         dict: A dictionary containing the predicted gaze coordinates, precision, accuracy, and cluster centroids.
     """
     # Inicialize standard scaler
-    sc = StandardScaler()
+    sc_x = StandardScaler()
+    sc_y = StandardScaler()
 
     # Load data from csv file and drop unnecessary columns
     df = pd.read_csv(data)
     df = df.drop(["screen_height", "screen_width"], axis=1)
+
+    # Drop rows with NaN values to prevent sklearn errors
+    df = df.dropna()
 
-    # Data for X axis
-    X_x = df[["left_iris_x", "right_iris_x"]]
-    X_y = df["point_x"]
-
-    # Normalize data using standard scaler and split data into training and testing sets
-    X_x = sc.fit_transform(X_x)
-    X_train_x, X_test_x, y_train_x, y_test_x = train_test_split(
-        X_x, X_y, test_size=0.2, random_state=42
+    # Create a stratification key based on (point_x, point_y) to ensure all calibration
+    # points are represented in both training and test sets
+    df["stratify_key"] = df["point_x"].astype(str) + "_" + df["point_y"].astype(str)
+
+    # Perform a single stratified split to ensure all points are in both train and test sets
+    train_indices, test_indices = train_test_split(
+        df.index, test_size=0.2, random_state=42, stratify=df["stratify_key"]
     )
+
+    df_train = df.loc[train_indices]
+    df_test = df.loc[test_indices]
+
+    # Data for X axis
+    X_x_train = sc_x.fit_transform(df_train[["left_iris_x", "right_iris_x"]])
+    X_x_test = sc_x.transform(df_test[["left_iris_x", "right_iris_x"]])
+    y_train_x = df_train["point_x"]
+    y_test_x = df_test["point_x"]
 
     if (
         model_X == "Linear Regression"
@@ -116,8 +128,8 @@ def predict(data, k, model_X, model_Y):
         model = models[model_X]
 
         # Fit the model and make predictions
-        model.fit(X_train_x, y_train_x)
-        y_pred_x = model.predict(X_test_x)
+        model.fit(X_x_train, y_train_x)
+        y_pred_x = model.predict(X_x_test)
     else:
         pipeline = models[model_X]
 
@@ -134,21 +146,17 @@ def predict(data, k, model_X, model_Y):
         )
 
         # Fit the GridSearchCV to the training data for X
-        grid_search.fit(X_train_x, y_train_x)
+        grid_search.fit(X_x_train, y_train_x)
 
         # Use the best estimator to predict the values and calculate the R2 score
         best_model_x = grid_search.best_estimator_
-        y_pred_x = best_model_x.predict(X_test_x)
+        y_pred_x = best_model_x.predict(X_x_test)
 
-    # Data for Y axis
-    X_y = df[["left_iris_y", "right_iris_y"]]
-    y_y = df["point_y"]
-
-    # Normalize data using standard scaler and split data into training and testing sets
-    X_y = sc.fit_transform(X_y)
-    X_train_y, X_test_y, y_train_y, y_test_y = train_test_split(
-        X_y, y_y, test_size=0.2, random_state=42
-    )
+    # Data for Y axis (use same train/test split as X for consistency)
+    X_y_train = sc_y.fit_transform(df_train[["left_iris_y", "right_iris_y"]])
+    X_y_test = sc_y.transform(df_test[["left_iris_y", "right_iris_y"]])
+    y_train_y = df_train["point_y"]
+    y_test_y = df_test["point_y"]
 
     if (
         model_Y == "Linear Regression"
@@ -158,8 +166,8 @@ def predict(data, k, model_X, model_Y):
         model = models[model_Y]
 
         # Fit the model and make predictions
-        model.fit(X_train_y, y_train_y)
-        y_pred_y = model.predict(X_test_y)
+        model.fit(X_y_train, y_train_y)
+        y_pred_y = model.predict(X_y_test)
     else:
         pipeline = models[model_Y]
 
@@ -176,11 +184,11 @@ def predict(data, k, model_X, model_Y):
         )
 
         # Fit the GridSearchCV to the training data for X
-        grid_search.fit(X_train_y, y_train_y)
+        grid_search.fit(X_y_train, y_train_y)
 
         # Use the best estimator to predict the values and calculate the R2 score
         best_model_y = grid_search.best_estimator_
-        y_pred_y = best_model_y.predict(X_test_y)
+        y_pred_y = best_model_y.predict(X_y_test)
 
     # Convert the predictions to a numpy array and apply KMeans clustering
     data = np.array([y_pred_x, y_pred_y]).T

From ec12e79e44966ca51687c05df4f94767a3bd2ce1 Mon Sep 17 00:00:00 2001
From: Sarthak Basu
Date: Mon, 9 Feb 2026 23:36:03 +0530
Subject: [PATCH 2/2] fix: revert variable naming to original convention (X_train_x)

---
 app/services/gaze_tracker.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/app/services/gaze_tracker.py b/app/services/gaze_tracker.py
index 723bcef..df3b494 100644
--- a/app/services/gaze_tracker.py
+++ b/app/services/gaze_tracker.py
@@ -115,8 +115,8 @@ def predict(data, k, model_X, model_Y):
     df_test = df.loc[test_indices]
 
     # Data for X axis
-    X_x_train = sc_x.fit_transform(df_train[["left_iris_x", "right_iris_x"]])
-    X_x_test = sc_x.transform(df_test[["left_iris_x", "right_iris_x"]])
+    X_train_x = sc_x.fit_transform(df_train[["left_iris_x", "right_iris_x"]])
+    X_test_x = sc_x.transform(df_test[["left_iris_x", "right_iris_x"]])
     y_train_x = df_train["point_x"]
     y_test_x = df_test["point_x"]
 
@@ -128,8 +128,8 @@ def predict(data, k, model_X, model_Y):
         model = models[model_X]
 
         # Fit the model and make predictions
-        model.fit(X_x_train, y_train_x)
-        y_pred_x = model.predict(X_x_test)
+        model.fit(X_train_x, y_train_x)
+        y_pred_x = model.predict(X_test_x)
     else:
         pipeline = models[model_X]
 
@@ -146,15 +146,15 @@ def predict(data, k, model_X, model_Y):
         )
 
         # Fit the GridSearchCV to the training data for X
-        grid_search.fit(X_x_train, y_train_x)
+        grid_search.fit(X_train_x, y_train_x)
 
         # Use the best estimator to predict the values and calculate the R2 score
         best_model_x = grid_search.best_estimator_
-        y_pred_x = best_model_x.predict(X_x_test)
+        y_pred_x = best_model_x.predict(X_test_x)
 
     # Data for Y axis (use same train/test split as X for consistency)
-    X_y_train = sc_y.fit_transform(df_train[["left_iris_y", "right_iris_y"]])
-    X_y_test = sc_y.transform(df_test[["left_iris_y", "right_iris_y"]])
+    X_train_y = sc_y.fit_transform(df_train[["left_iris_y", "right_iris_y"]])
+    X_test_y = sc_y.transform(df_test[["left_iris_y", "right_iris_y"]])
     y_train_y = df_train["point_y"]
     y_test_y = df_test["point_y"]
 
@@ -166,8 +166,8 @@ def predict(data, k, model_X, model_Y):
         model = models[model_Y]
 
         # Fit the model and make predictions
-        model.fit(X_y_train, y_train_y)
-        y_pred_y = model.predict(X_y_test)
+        model.fit(X_train_y, y_train_y)
+        y_pred_y = model.predict(X_test_y)
     else:
         pipeline = models[model_Y]
 
@@ -184,11 +184,11 @@ def predict(data, k, model_X, model_Y):
         )
 
         # Fit the GridSearchCV to the training data for X
-        grid_search.fit(X_y_train, y_train_y)
+        grid_search.fit(X_train_y, y_train_y)
 
         # Use the best estimator to predict the values and calculate the R2 score
         best_model_y = grid_search.best_estimator_
-        y_pred_y = best_model_y.predict(X_y_test)
+        y_pred_y = best_model_y.predict(X_test_y)
 
     # Convert the predictions to a numpy array and apply KMeans clustering
     data = np.array([y_pred_x, y_pred_y]).T
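
A minimal standalone sketch of the stratified calibration split introduced in PATCH 1/2, for reproducing the idea outside the service. The CSV path ("calibration.csv") and the plain LinearRegression estimators are illustrative assumptions, not the repository's `models` dictionary; only the column names and the split/scaling logic follow the diff. Note that `train_test_split(..., stratify=...)` raises a ValueError if any calibration point has fewer than two samples, which normal calibration recordings should satisfy.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hypothetical calibration recording with the columns used in the diff.
df = pd.read_csv("calibration.csv").dropna()

# One stratification key per calibration point so every point lands in
# both the training and the test split.
df["stratify_key"] = df["point_x"].astype(str) + "_" + df["point_y"].astype(str)
train_idx, test_idx = train_test_split(
    df.index, test_size=0.2, random_state=42, stratify=df["stratify_key"]
)
df_train, df_test = df.loc[train_idx], df.loc[test_idx]

# Separate scalers per axis, fitted on the training split only and
# reused to transform the test split (no test-set leakage).
sc_x, sc_y = StandardScaler(), StandardScaler()
X_train_x = sc_x.fit_transform(df_train[["left_iris_x", "right_iris_x"]])
X_test_x = sc_x.transform(df_test[["left_iris_x", "right_iris_x"]])
X_train_y = sc_y.fit_transform(df_train[["left_iris_y", "right_iris_y"]])
X_test_y = sc_y.transform(df_test[["left_iris_y", "right_iris_y"]])

# One regressor per screen axis (stand-ins for the service's models dict).
model_x = LinearRegression().fit(X_train_x, df_train["point_x"])
model_y = LinearRegression().fit(X_train_y, df_train["point_y"])
pred = np.column_stack([model_x.predict(X_test_x), model_y.predict(X_test_y)])
print(pred[:5])  # first few predicted (x, y) gaze points on the held-out split

Because the split is computed once on df.index and both axes reuse df_train/df_test, the X and Y models are evaluated on exactly the same held-out frames, which is the consistency the second comment in the diff refers to.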