From 2319b62d2493a2276975b8fbf411d75b210277f5 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 07:02:53 +0600 Subject: [PATCH 01/13] Update requirements.txt and fit script for draft --- model/rfc_fit.py | 19 +++++++++++++++---- requirements.txt | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/model/rfc_fit.py b/model/rfc_fit.py index 745bb4b..a863b02 100644 --- a/model/rfc_fit.py +++ b/model/rfc_fit.py @@ -1,8 +1,17 @@ -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split, RandomizedSearchCV from sklearn.preprocessing import StandardScaler -from sklearn.metrics import classification_report, confusion_matrix from sklearn.impute import SimpleImputer +from sklearn.metrics import classification_report + +from sklearn.ensemble import RandomForestClassifier,VotingClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.svm import SVC +from xgboost import XGBClassifier + +from imblearn.over_sampling import SMOTE +from imblearn.pipeline import Pipeline + +import time import pandas as pd import numpy as np import joblib @@ -105,6 +114,9 @@ def get_window(camps): X_scaled, y, test_size=1/3, shuffle=True, random_state=91, stratify=y ) +##### STAR CHANGING FROM HERE !! 
+ + # Model Development model = RandomForestClassifier(n_estimators=1000,max_depth=None,random_state=91,class_weight="balanced") model.fit(X_train, y_train) @@ -112,7 +124,6 @@ def get_window(camps): # Model Performence Evaluation y_pred = model.predict(X_test) print("Confusion Matrix:") -print(confusion_matrix(y_test, y_pred)) print("\nClassification Report:") print(classification_report(y_test, y_pred, target_names=["FALSE POSITIVE","CANDIDATE","CONFIRMED"])) diff --git a/requirements.txt b/requirements.txt index 8d2e3f8..fa878c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ pandas scikit-learn joblib gunicorn +xgboost +imbalanced-learn From 928df3c692be0b8463efc430a602db4d049a813d Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:29:04 +0600 Subject: [PATCH 02/13] Delete old files --- model/rfc_cross_val.py | 114 -------------------------------- model/rfc_fit.py | 144 ----------------------------------------- 2 files changed, 258 deletions(-) delete mode 100644 model/rfc_cross_val.py delete mode 100644 model/rfc_fit.py diff --git a/model/rfc_cross_val.py b/model/rfc_cross_val.py deleted file mode 100644 index 02d7150..0000000 --- a/model/rfc_cross_val.py +++ /dev/null @@ -1,114 +0,0 @@ -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import cross_val_score, StratifiedKFold -from sklearn.preprocessing import StandardScaler -from sklearn.impute import SimpleImputer -import pandas as pd -import numpy as np - -# Loading and processing 1st Dataset (NASA Kepler Objects of Interest) -df_raw = pd.read_csv("data/kepler_data.csv", comment="#") -feature_list = ["koi_disposition","koi_period","koi_time0bk","koi_depth","koi_prad","koi_sma","koi_incl","koi_teq","koi_insol","koi_impact","koi_ror","koi_srho","koi_dor","koi_num_transits"] -df_selected = df_raw[feature_list] - -df_1 = df_selected.copy() - -# Loading and processing 2nd dataset (NASA K2 Objects of Interest) -df_2 = 
pd.read_csv("data/k2_data.csv",comment="#") - -## Feature Engineering missing column in K2 (koi_num_transits) -#### This part was generated with an AI tool (a LLM service named Grok, URL: https://grok.com) - -# Campaign dates dictionary (BJD) -campaign_dates = { - 0: (2456725.0, 2456805.0), - 1: (2456808.0, 2456891.0), - 2: (2456893.0, 2456975.0), - 3: (2456976.0, 2457064.0), - 4: (2457065.0, 2457159.0), - 5: (2457159.0, 2457246.0), - 6: (2457250.0, 2457338.0), - 7: (2457339.0, 2457420.0), - 8: (2457421.0, 2457530.0), - 9: (2457504.0, 2457579.0), - 10: (2457577.0, 2457653.0), - 11: (2457657.0, 2457732.0), - 12: (2457731.0, 2457819.0), - 13: (2457820.0, 2457900.0), - 14: (2457898.0, 2457942.0), - 15: (2457941.0, 2458022.0), - 16: (2458020.0, 2458074.0), - 17: (2458074.0, 2458176.0), - 18: (2458151.0, 2458201.0), - 19: (2458232.0, 2458348.0) -} - -def get_window(camps): - if pd.isna(camps) or not camps: - return np.nan, np.nan - - camps = str(camps).split(',') if isinstance(camps, str) else camps - - # Filter valid campaign numbers and get start/end times - starts = [] - ends = [] - for c in camps: - try: - camp_num = int(c.strip()) - if camp_num in campaign_dates: - start, end = campaign_dates[camp_num] - starts.append(start) - ends.append(end) - except (ValueError, KeyError): - continue - - return (min(starts) if starts else np.nan, max(ends) if ends else np.nan) - - -df_2['campaigns'] = df_2['k2_campaigns'] -df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(lambda x: pd.Series(get_window(x))) - -# For transit counting (as before) -df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0) -df_2 = 
df_2[["disposition","pl_orbper","pl_tranmid","pl_trandep","pl_rade","pl_orbsmax","pl_orbincl","pl_eqt","pl_insol","pl_imppar","pl_ratror","pl_dens","pl_ratdor","num_transits"]] - -#### AI written part ends here - -# Concatenating df_1 and df_2 -mapping = {"disposition":"koi_disposition","pl_orbper":"koi_period","pl_tranmid":"koi_time0bk", - "pl_trandep":"koi_depth","pl_rade":"koi_prad","pl_orbsmax":"koi_sma", - "pl_orbincl":"koi_incl","pl_eqt":"koi_teq","pl_insol":"koi_insol","pl_imppar":"koi_impact", - "pl_ratror":"koi_ror","pl_dens":"koi_srho","pl_ratdor":"koi_dor","num_transits":"koi_num_transits" - } -df_2 = df_2.rename(columns=mapping) - -df = pd.concat([df_1,df_2]) -print(df.shape) # Output: (13568, 14) - -# Input-output separation -X = df.iloc[:,1:].to_numpy() -y = df["koi_disposition"].map({"FALSE POSITIVE":0,"CANDIDATE":1,"CONFIRMED":2,"REFUTED":0}).to_numpy() - -# Imputation -imputer = SimpleImputer(strategy="median") -X = imputer.fit_transform(X) - -# Feature scaling -scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) - -# The Model -model = RandomForestClassifier(n_estimators=500,max_depth=None,random_state=91,class_weight="balanced") - -# Performing Cross Validation -kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=89) -score = cross_val_score(model,X_scaled,y,cv=kfold) -print(f"Average = {score.mean()}") -print(f"Full Matrix:\n {score}") - -## Output - -## Average = 0.7502214425969995 -## Full Matrix: -## [0.74907885 0.74907885 0.74797347 0.75340951 0.75156653] \ No newline at end of file diff --git a/model/rfc_fit.py b/model/rfc_fit.py deleted file mode 100644 index a863b02..0000000 --- a/model/rfc_fit.py +++ /dev/null @@ -1,144 +0,0 @@ -from sklearn.model_selection import train_test_split, RandomizedSearchCV -from sklearn.preprocessing import StandardScaler -from sklearn.impute import SimpleImputer -from sklearn.metrics import classification_report - -from sklearn.ensemble import RandomForestClassifier,VotingClassifier -from 
sklearn.linear_model import LogisticRegression -from sklearn.svm import SVC -from xgboost import XGBClassifier - -from imblearn.over_sampling import SMOTE -from imblearn.pipeline import Pipeline - -import time -import pandas as pd -import numpy as np -import joblib - -# Loading and processing 1st Dataset (NASA Kepler Objects of Interest) -df_raw = pd.read_csv("data/kepler_data.csv", comment="#") -feature_list = ["koi_disposition","koi_period","koi_time0bk","koi_depth","koi_prad","koi_sma","koi_incl","koi_teq","koi_insol","koi_impact","koi_ror","koi_srho","koi_dor","koi_num_transits"] -df_selected = df_raw[feature_list] - -df_1 = df_selected.copy() - -# Loading and processing 2nd dataset (NASA K2 Objects of Interest) -df_2 = pd.read_csv("data/k2_data.csv",comment="#") - -## Feature Engineering missing column in K2 (koi_num_transits) -#### This part was generated with an AI tool (a LLM service named Grok, URL: https://grok.com) - -# Campaign dates dictionary (BJD) -campaign_dates = { - 0: (2456725.0, 2456805.0), - 1: (2456808.0, 2456891.0), - 2: (2456893.0, 2456975.0), - 3: (2456976.0, 2457064.0), - 4: (2457065.0, 2457159.0), - 5: (2457159.0, 2457246.0), - 6: (2457250.0, 2457338.0), - 7: (2457339.0, 2457420.0), - 8: (2457421.0, 2457530.0), - 9: (2457504.0, 2457579.0), - 10: (2457577.0, 2457653.0), - 11: (2457657.0, 2457732.0), - 12: (2457731.0, 2457819.0), - 13: (2457820.0, 2457900.0), - 14: (2457898.0, 2457942.0), - 15: (2457941.0, 2458022.0), - 16: (2458020.0, 2458074.0), - 17: (2458074.0, 2458176.0), - 18: (2458151.0, 2458201.0), - 19: (2458232.0, 2458348.0) -} - -def get_window(camps): - if pd.isna(camps) or not camps: - return np.nan, np.nan - - camps = str(camps).split(',') if isinstance(camps, str) else camps - - # Filter valid campaign numbers and get start/end times - starts = [] - ends = [] - for c in camps: - try: - camp_num = int(c.strip()) - if camp_num in campaign_dates: - start, end = campaign_dates[camp_num] - starts.append(start) - ends.append(end) - 
except (ValueError, KeyError): - continue - - return (min(starts) if starts else np.nan, max(ends) if ends else np.nan) - - -df_2['campaigns'] = df_2['k2_campaigns'] -df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(lambda x: pd.Series(get_window(x))) - -# For transit counting (as before) -df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0) -df_2 = df_2[["disposition","pl_orbper","pl_tranmid","pl_trandep","pl_rade","pl_orbsmax","pl_orbincl","pl_eqt","pl_insol","pl_imppar","pl_ratror","pl_dens","pl_ratdor","num_transits"]] - -#### AI written part ends here - -# Concatenating df_1 and df_2 -mapping = {"disposition":"koi_disposition","pl_orbper":"koi_period","pl_tranmid":"koi_time0bk", - "pl_trandep":"koi_depth","pl_rade":"koi_prad","pl_orbsmax":"koi_sma", - "pl_orbincl":"koi_incl","pl_eqt":"koi_teq","pl_insol":"koi_insol","pl_imppar":"koi_impact", - "pl_ratror":"koi_ror","pl_dens":"koi_srho","pl_ratdor":"koi_dor","num_transits":"koi_num_transits" - } -df_2 = df_2.rename(columns=mapping) - -df = pd.concat([df_1,df_2]) -print(df.shape) # Output: (13568, 14) - -# Input-output separation -X = df.iloc[:,1:].to_numpy() -y = df["koi_disposition"].map({"FALSE POSITIVE":0,"CANDIDATE":1,"CONFIRMED":2,"REFUTED":0}).to_numpy() - -# Imputation -imputer = SimpleImputer(strategy="median") -X = imputer.fit_transform(X) - -# Feature scaling -scaler = StandardScaler() -X_scaled = scaler.fit_transform(X) - -# Train-test split -X_train, X_test, y_train, y_test = train_test_split( - X_scaled, y, test_size=1/3, shuffle=True, random_state=91, stratify=y -) - -##### STAR CHANGING FROM HERE !! 
- - -# Model Development -model = RandomForestClassifier(n_estimators=1000,max_depth=None,random_state=91,class_weight="balanced") -model.fit(X_train, y_train) - -# Model Performence Evaluation -y_pred = model.predict(X_test) -print("Confusion Matrix:") -print("\nClassification Report:") -print(classification_report(y_test, y_pred, target_names=["FALSE POSITIVE","CANDIDATE","CONFIRMED"])) - -## Saving the model for pipeline (NOTE: This code should be run only once) -joblib.dump(model, "model.pkl") -joblib.dump(scaler, "scaler.pkl") - -# Model Performence Evaluation Result -# Classification Report: -# precision recall f1-score support - -# FALSE POSITIVE 0.81 0.84 0.82 1718 -# CANDIDATE 0.59 0.47 0.53 1118 -# CONFIRMED 0.77 0.83 0.80 1687 -# -# accuracy 0.75 4523 -# macro avg 0.72 0.72 0.72 4523 -# weighted avg 0.74 0.75 0.74 4523 \ No newline at end of file From 086013c0c9d4f9aea56ec4d76a59aac9aa774baf Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:29:31 +0600 Subject: [PATCH 03/13] Update fit.py --- fit.py | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 fit.py diff --git a/fit.py b/fit.py new file mode 100644 index 0000000..57a105f --- /dev/null +++ b/fit.py @@ -0,0 +1,149 @@ +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.impute import SimpleImputer + +from sklearn.ensemble import RandomForestClassifier, StackingClassifier +from sklearn.linear_model import LogisticRegression +from xgboost import XGBClassifier + +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA + +from imblearn.over_sampling import SMOTE +from imblearn.pipeline import Pipeline + +import time +import pandas as pd +import numpy as np +import joblib + +# Loading and processing 1st Dataset (NASA Kepler Objects of Interest) +df_raw = pd.read_csv("data/kepler_data.csv", comment="#") + +feature_list = 
["koi_disposition","koi_period","koi_time0bk", + "koi_depth","koi_prad","koi_sma","koi_incl","koi_teq","koi_insol", + "koi_impact","koi_ror","koi_srho","koi_dor","koi_num_transits" +] + +df_selected = df_raw[feature_list] + +df_1 = df_selected.copy() + +# Loading and processing 2nd dataset (NASA K2 Objects of Interest) +df_2 = pd.read_csv("data/k2_data.csv",comment="#") + +## Feature Engineering missing column in K2 (koi_num_transits) +# Campaign dates dictionary (BJD) +campaign_dates = { + 0: (2456725.0, 2456805.0), + 1: (2456808.0, 2456891.0), + 2: (2456893.0, 2456975.0), + 3: (2456976.0, 2457064.0), + 4: (2457065.0, 2457159.0), + 5: (2457159.0, 2457246.0), + 6: (2457250.0, 2457338.0), + 7: (2457339.0, 2457420.0), + 8: (2457421.0, 2457530.0), + 9: (2457504.0, 2457579.0), + 10: (2457577.0, 2457653.0), + 11: (2457657.0, 2457732.0), + 12: (2457731.0, 2457819.0), + 13: (2457820.0, 2457900.0), + 14: (2457898.0, 2457942.0), + 15: (2457941.0, 2458022.0), + 16: (2458020.0, 2458074.0), + 17: (2458074.0, 2458176.0), + 18: (2458151.0, 2458201.0), + 19: (2458232.0, 2458348.0) +} + +def get_window(camps): + if pd.isna(camps) or not camps: + return np.nan, np.nan + + camps = str(camps).split(',') if isinstance(camps, str) else camps + + # Filter valid campaign numbers and get start/end times + starts = [] + ends = [] + for c in camps: + try: + camp_num = int(c.strip()) + if camp_num in campaign_dates: + start, end = campaign_dates[camp_num] + starts.append(start) + ends.append(end) + except (ValueError, KeyError): + continue + + return (min(starts) if starts else np.nan, max(ends) if ends else np.nan) + +df_2['campaigns'] = df_2['k2_campaigns'] +df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(lambda x: pd.Series(get_window(x))) + +# For transit counting (as before) +df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) +df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) 
+df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0) +df_2 = df_2[["disposition","pl_orbper","pl_tranmid","pl_trandep","pl_rade","pl_orbsmax","pl_orbincl","pl_eqt","pl_insol","pl_imppar","pl_ratror","pl_dens","pl_ratdor","num_transits"]] + +# Concatenating df_1 and df_2 +mapping = {"disposition":"koi_disposition","pl_orbper":"koi_period","pl_tranmid":"koi_time0bk", + "pl_trandep":"koi_depth","pl_rade":"koi_prad","pl_orbsmax":"koi_sma", + "pl_orbincl":"koi_incl","pl_eqt":"koi_teq","pl_insol":"koi_insol","pl_imppar":"koi_impact", + "pl_ratror":"koi_ror","pl_dens":"koi_srho","pl_ratdor":"koi_dor","num_transits":"koi_num_transits" + } +df_2 = df_2.rename(columns=mapping) + +df = pd.concat([df_1,df_2]) + +# Input-output separation +X = df.iloc[:,1:] +column_name = X.columns +X = X.to_numpy() +y = df["koi_disposition"].map({"FALSE POSITIVE":0,"CANDIDATE":1,"CONFIRMED":2,"REFUTED":0}).to_numpy() + +# Train-test split +x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y +) + +# Define Models +rf = RandomForestClassifier(n_estimators=1000,max_depth=None,random_state=542,class_weight="balanced") +xgb = XGBClassifier(n_estimators=1000,max_depth=None,learning_rate=0.5,random_state=9) + +# Define estimators list for stacking +estimators = [ + ("rf",rf), + ("xgb",xgb) +] + +# Define final estimator for stacking +final_estimator = LogisticRegression( + random_state=891,class_weight="balanced",C=0.1,penalty="l2",solver="saga",max_iter=5000 +) + +# Stacking +mv = StackingClassifier( + estimators=estimators,final_estimator=final_estimator, + cv=5,passthrough=True,n_jobs=-1 +) + +# Pipeline +pipe_mv = Pipeline([ + ("impute",SimpleImputer(strategy="mean")), + ("scale",StandardScaler()), + ("smote",SMOTE()), + ("model",mv) +]) + +print("Starting model training. 
It will take some time, sit tight......") +t1 = time.time() +pipe_mv.fit(x_train,y_train) +t2 = time.time() +print("Model trained successfully") +minutes,seconds = np.divmod(t2-t1,60) +print(f"Time Elapsed: {minutes} M {seconds:.2f} S") + +# Dumping the model in pickle files +joblib.dump(pipe_mv,"models/pipe.pkl") +joblib.dump(column_name,"models/column_names.pkl") \ No newline at end of file From 8a8e330c1f4ceda404750154f91e225057095a4e Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:29:48 +0600 Subject: [PATCH 04/13] Upload final version of research.ipynb --- research.ipynb | 711 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 711 insertions(+) create mode 100644 research.ipynb diff --git a/research.ipynb b/research.ipynb new file mode 100644 index 0000000..6f29b44 --- /dev/null +++ b/research.ipynb @@ -0,0 +1,711 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4424f511", + "metadata": {}, + "source": [ + "# Exoplanet Classifier\n", + "Welcome to my forked version of The Exoplanet Classifier. This project was developed with my teammates from Ontohin 4b for the NASA Space Apps Challenge 2025.\n", + "\n", + "I have used this notebook as a sandbox to test out different approaches to see how we can yield the maximum predictive performance. You will also find some interesting plots too." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9dead9cb", + "metadata": {}, + "source": [ + "## Importing the libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "37dd9f00", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV,learning_curve\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.metrics import classification_report\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier,VotingClassifier,StackingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from xgboost import XGBClassifier\n", + "from lightgbm import LGBMClassifier\n", + "\n", + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from imblearn.pipeline import Pipeline\n", + "from imblearn.combine import SMOTETomek\n", + "\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import time\n", + "import pandas as pd\n", + "import numpy as np\n", + "import joblib" + ] + }, + { + "cell_type": "markdown", + "id": "36860f5f", + "metadata": {}, + "source": [ + "## Importing and cleaning Data" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a0777e94", + "metadata": {}, + "outputs": [], + "source": [ + "# Loading and processing 1st Dataset (NASA Kepler Objects of Interest)\n", + "df_raw = pd.read_csv(\"data/kepler_data.csv\", comment=\"#\")\n", + "\n", + "feature_list = [\"koi_disposition\",\"koi_period\",\"koi_time0bk\",\n", + " \"koi_depth\",\"koi_prad\",\"koi_sma\",\"koi_incl\",\"koi_teq\",\"koi_insol\",\n", + " \"koi_impact\",\"koi_ror\",\"koi_srho\",\"koi_dor\",\"koi_num_transits\"\n", + "]\n", + "\n", + "df_selected = df_raw[feature_list]\n", + "\n", + "df_1 = 
df_selected.copy()\n", + "\n", + "# Loading and processing 2nd dataset (NASA K2 Objects of Interest)\n", + "df_2 = pd.read_csv(\"data/k2_data.csv\",comment=\"#\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ec57b1b", + "metadata": {}, + "source": [ + "Feature Engineering to construct number of transits in K2 Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "59bebfeb", + "metadata": {}, + "outputs": [], + "source": [ + "## Feature Engineering missing column in K2 (koi_num_transits) \n", + "\n", + "# Campaign dates dictionary (BJD)\n", + "campaign_dates = {\n", + " 0: (2456725.0, 2456805.0),\n", + " 1: (2456808.0, 2456891.0),\n", + "\n", + " 3: (2456976.0, 2457064.0),\n", + " 4: (2457065.0, 2457159.0),\n", + " 5: (2457159.0, 2457246.0),\n", + " 6: (2457250.0, 2457338.0),\n", + " 7: (2457339.0, 2457420.0),\n", + " 8: (2457421.0, 2457530.0),\n", + " 9: (2457504.0, 2457579.0),\n", + " 10: (2457577.0, 2457653.0),\n", + " 11: (2457657.0, 2457732.0),\n", + " 12: (2457731.0, 2457819.0),\n", + " 13: (2457820.0, 2457900.0),\n", + " 14: (2457898.0, 2457942.0),\n", + " 15: (2457941.0, 2458022.0),\n", + " 16: (2458020.0, 2458074.0),\n", + " 17: (2458074.0, 2458176.0),\n", + " 18: (2458151.0, 2458201.0),\n", + " 19: (2458232.0, 2458348.0)\n", + "}\n", + "\n", + "def get_window(camps):\n", + " if pd.isna(camps) or not camps:\n", + " return np.nan, np.nan\n", + " \n", + " camps = str(camps).split(',') if isinstance(camps, str) else camps\n", + " \n", + " # Filter valid campaign numbers and get start/end times\n", + " starts = []\n", + " ends = []\n", + " for c in camps:\n", + " try:\n", + " camp_num = int(c.strip())\n", + " if camp_num in campaign_dates:\n", + " start, end = campaign_dates[camp_num]\n", + " starts.append(start)\n", + " ends.append(end)\n", + " except (ValueError, KeyError):\n", + " continue \n", + " \n", + " return (min(starts) if starts else np.nan, max(ends) if ends else np.nan)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 28, + "id": "c8a88b17", + "metadata": {}, + "outputs": [], + "source": [ + "df_2['campaigns'] = df_2['k2_campaigns']\n", + "df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(lambda x: pd.Series(get_window(x)))\n", + "\n", + "# For transit counting (as before)\n", + "df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])\n", + "df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper'])\n", + "df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0)\n", + "df_2 = df_2[[\"disposition\",\"pl_orbper\",\"pl_tranmid\",\"pl_trandep\",\n", + "\"pl_rade\",\"pl_orbsmax\",\"pl_orbincl\",\"pl_eqt\",\"pl_insol\",\"pl_imppar\",\"pl_ratror\",\"pl_dens\",\n", + "\"pl_ratdor\",\"num_transits\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "30f37767", + "metadata": {}, + "source": [ + "Concatenating df_1 and df_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e2e3b89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(13568, 14)\n" + ] + } + ], + "source": [ + "mapping = {\"disposition\":\"koi_disposition\",\"pl_orbper\":\"koi_period\",\"pl_tranmid\":\"koi_time0bk\",\n", + " \"pl_trandep\":\"koi_depth\",\"pl_rade\":\"koi_prad\",\"pl_orbsmax\":\"koi_sma\",\n", + " \"pl_orbincl\":\"koi_incl\",\"pl_eqt\":\"koi_teq\",\"pl_insol\":\"koi_insol\",\"pl_imppar\":\"koi_impact\",\n", + " \"pl_ratror\":\"koi_ror\",\"pl_dens\":\"koi_srho\",\"pl_ratdor\":\"koi_dor\",\"num_transits\":\"koi_num_transits\"\n", + " }\n", + "df_2 = df_2.rename(columns=mapping)\n", + "\n", + "df = pd.concat([df_1,df_2])\n", + "print(df.shape) # Output: (13568, 14)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2f7f0c9b", + "metadata": {}, + "outputs": [], + "source": [ + "# Input-output separation\n", + "X = df.iloc[:,1:].to_numpy()\n", + "y = df[\"koi_disposition\"].map({\"FALSE 
POSITIVE\":0,\"CANDIDATE\":1,\"CONFIRMED\":2,\"REFUTED\":0}).to_numpy()" + ] + }, + { + "cell_type": "markdown", + "id": "e7e8f588", + "metadata": {}, + "source": [ + "## ML Tasks" + ] + }, + { + "cell_type": "markdown", + "id": "2f1867cb", + "metadata": {}, + "source": [ + "Train-test split" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "077969ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Train-test split\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "296c37a0", + "metadata": {}, + "source": [ + "### **Trial - 01**: Trying out with different learners\n", + "In this part, I have performed a **GridSearchCV** on multiple classifier algorithms (**RandomForest** and **XGBoost**) to find the best hyperparameter configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ca79b349", + "metadata": {}, + "outputs": [], + "source": [ + "# Define Models\n", + "rf_model_1 = RandomForestClassifier(random_state=93)\n", + "xgb_model_1 = XGBClassifier(random_state = 94)\n", + "\n", + "# Dimensionality Reduction\n", + "pca_1 = PCA(random_state=81)\n", + "lda_1 = LDA(n_components=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "5b70a030", + "metadata": {}, + "outputs": [], + "source": [ + "pipe_1 = Pipeline([\n", + " (\"impute\",SimpleImputer(strategy=\"median\")),\n", + " (\"scale\",StandardScaler()),\n", + " (\"smote\",SMOTE(random_state=11)),\n", + " (\"dimen\",pca_1),\n", + " (\"model\",xgb_model_1)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "13aa6740", + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " { # XGBClassifier, PCA On\n", + " \"model\":[xgb_model_1],\"model__n_estimators\":[500,1000],\n", + " \"model__max_depth\": [10,16,None],\"model__learning_rate\": [0.05,0.1],\n", + " \"dimen\": 
[pca_1], \"dimen__n_components\": [0.90,0.95]\n", + " },\n", + " { # XGBClassifier, PCA On, SMOTE Off\n", + " \"model\":[xgb_model_1],\"model__n_estimators\":[500,1000],\n", + " \"model__max_depth\": [10,16,None],\"model__learning_rate\": [0.05,0.1],\n", + " \"dimen\": [lda_1], \"smote\":[\"passthrough\"]\n", + " },\n", + " { # RandomForestClassifier, PCA On\n", + " \"model\": [rf_model_1],\"model__n_estimators\":[500,1000], \n", + " \"model__max_depth\": [10,16,None], \"dimen\": [pca_1], \n", + " \"dimen__n_components\": [0.90,0.95]\n", + " },\n", + " { # RandomForestClassifier, LDA On\n", + " \"model\": [rf_model_1], \"model__n_estimators\": [500,1000], \n", + " \"model__max_depth\": [10,16,None], \"dimen\": [lda_1]\n", + " }\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "083032ca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Score = 0.6719750304142318\n", + "Best Configuration = {'dimen': PCA(random_state=81), 'dimen__n_components': 0.95, 'model': RandomForestClassifier(random_state=93), 'model__max_depth': 16, 'model__n_estimators': 500}\n" + ] + } + ], + "source": [ + "rscv_1 = GridSearchCV(\n", + " estimator=pipe_1,param_grid=param_grid,refit=True,cv=4,verbose=1,n_jobs=-1,\n", + ")\n", + "\n", + "rscv_1.fit(x_train,y_train)\n", + "\n", + "config_1 = rscv_1.best_params_\n", + "score_1 = rscv_1.best_score_\n", + "estimator = rscv_1.best_estimator_\n", + "print(f\"Best Score = {score_1}\")\n", + "print(f\"Best Configuration = {config_1}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "7b12e8a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.80 0.75 0.77 1718\n", + " 1 0.48 0.47 0.47 1118\n", + " 2 0.69 0.73 0.71 1687\n", + "\n", + " accuracy 0.68 4523\n", + " macro avg 0.65 0.65 0.65 4523\n", + "weighted avg 0.68 0.68 0.68 4523\n", + "\n" + 
] + } + ], + "source": [ + "y_ture = y_test \n", + "y_pred = rscv_1.predict(x_test)\n", + "print(classification_report(y_ture,y_pred))" + ] + }, + { + "cell_type": "markdown", + "id": "91fee85a", + "metadata": {}, + "source": [ + "### **Trial - 02**: Stacking fine-tuned classifiers \n", + "In this trial, I have fine-tuned a RandomForest and a XGBoost classifiers. Then, I stacked those two using **stacking** ensemble technique. This should generate a better generalization result.\n", + "\n", + "**NOTE**: I have kept the max_depth for both of the models None because it actually generates a higher performace. I tested it when I was working with this notebook for the first time. The GridSearchCV has actually failed to fit the configuration where max_dpeth is None. After some internet search, I found out that GridSearchCV doesn't keep None as it is. So, it did not capture performance gain when the max_depth was None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "568c9c50", + "metadata": {}, + "outputs": [], + "source": [ + "# Define Models\n", + "rf = RandomForestClassifier(n_estimators=500,max_depth=None,random_state=542,class_weight=\"balanced\")\n", + "xgb = XGBClassifier(n_estimators=500,max_depth=None,learning_rate=0.05,random_state=9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38faed81", + "metadata": {}, + "outputs": [], + "source": [ + "# List of estimators\n", + "estimators = [\n", + " (\"rf\",rf),\n", + " (\"xgb\",xgb)\n", + "]\n", + "\n", + "# Final estimator for stacking\n", + "final_estimator = LogisticRegression(random_state=891,class_weight=\"balanced\",C=1,penalty=\"l2\",solver=\"lbfgs\")\n", + "\n", + "# Stacking using scikit-learn's implementation\n", + "mv = StackingClassifier(\n", + " estimators=estimators,final_estimator=final_estimator,\n", + " cv=5,passthrough=True,verbose=1\n", + ")\n", + "\n", + "pipe_mv = Pipeline([\n", + " (\"impute\",SimpleImputer(strategy=\"mean\")),\n", + " 
(\"scale\",StandardScaler()),\n", + " (\"smote\",SMOTE()),\n", + " (\"model\",mv)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "07cb100f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.2s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.1s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.2s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 20.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.7s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.4s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 20.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.3s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.2s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.1s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.7s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.2s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 44.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 20.8s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.0s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.9s finished\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 45.4s finished\n", + 
"[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.\n", + "[Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 4.1s remaining: 6.1s\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 4.2s finished\n", + "[Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 21.0s remaining: 31.5s\n", + "[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 21.2s finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best config\n", + "{'model__final_estimator__solver': 'saga', 'model__final_estimator__penalty': 'l2', 'model__final_estimator__max_iter': 5000, 'model__final_estimator__C': 0.1}\n", + "Best score = 0.7417357656163626\n" + ] + } + ], + "source": [ + "param_grid = [\n", + " {\n", + " \"model__final_estimator__C\": [0.1,1,10], \"model__final_estimator__penalty\": [\"l2\"],\n", + " \"model__final_estimator__solver\": [\"lbfgs\",\"saga\"],\"model__final_estimator__max_iter\": [5000]\n", + " }\n", + "]\n", + "rscv = RandomizedSearchCV(pipe_mv,param_grid,n_iter=4,cv=3,refit=True,n_jobs=2,random_state=85)\n", + "rscv.fit(x_train,y_train)\n", + "print(f\"Best config\\n{rscv.best_params_}\")\n", + "print(f\"Best score = {rscv.best_score_}\")\n", + "estimator = rscv.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "967f3010", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.81 0.81 1718\n", + " 1 0.56 0.55 0.56 1118\n", + " 2 0.79 0.80 0.80 1687\n", + "\n", + " accuracy 0.75 4523\n", + " macro avg 0.72 0.72 0.72 4523\n", + "weighted avg 0.75 0.75 0.75 4523\n", + "\n" + ] + } + ], + "source": [ + "y_true = y_test\n", + "y_pred = rscv.predict(x_test)\n", + "print(classification_report(y_true,y_pred))" + ] + }, + { + "cell_type": "markdown", + "id": "b1f82880", + "metadata": {}, + "source": [ + 
"As we can see, we have got 0.75 overall accuracy. Unfortunately, we could not improve it much from the original version which we crafted during the hackathon. We have achieved a bit higher recall for class 1. But, it is still not a huge win. \n", + "\n", + "However, it is a drastic improvement from **Trial-01**. This indirectly proves that GridSearchCV failed to test with `max_depth=None`." + ] + }, + { + "cell_type": "markdown", + "id": "13890590", + "metadata": {}, + "source": [ + "## Visualization" + ] + }, + { + "cell_type": "markdown", + "id": "0886fb1b", + "metadata": {}, + "source": [ + "Plotting **Correlational Heatmap** of the original dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "cce8475d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12,8))\n", + "cor = df.iloc[:,1:].corr()\n", + "sns.heatmap(cor,cmap=\"icefire\",fmt=\".2f\",annot=True)\n", + "plt.title(\"Correlational Heatmap of the feature columns\", fontdict={\"fontsize\":15})\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "7ff5bc57", + "metadata": {}, + "source": [ + "This heatmap clearly shows **high multicollinearity**, especially between `koi_ror` and `koi_impact` (0.99) and `koi_period` and `koi_sma` (0.87)." + ] + }, + { + "cell_type": "markdown", + "id": "db0e7120", + "metadata": {}, + "source": [ + "Plotting **Learning Curve**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21d0418e", + "metadata": {}, + "outputs": [], + "source": [ + "train_sizes, train_sc, val_sc = learning_curve(\n", + " estimator,x_train,y_train,train_sizes=np.linspace(0.1,1.0,10),cv=5,n_jobs=-1,shuffle=True,random_state=59\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "0ea4f14a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,6))\n", + "\n", + "train_mean = np.mean(train_sc,axis=1)\n", + "train_std = np.std(train_sc,axis=1)\n", + "val_mean = np.mean(val_sc,axis=1)\n", + "val_std = np.std(val_sc,axis=1)\n", + "\n", + "plt.plot(\n", + " train_sizes,train_mean, color=\"red\", marker=\"v\", markersize=5, label=\"Training Mean\"\n", + ")\n", + "\n", + "plt.fill_between(\n", + " train_sizes, train_mean-train_std, train_mean+train_std, color=\"red\",alpha=0.3\n", + ")\n", + "\n", + "plt.plot(\n", + " train_sizes,val_mean, color=\"orange\", marker=\"s\", markersize=5, label=\"Validation Mean\"\n", + ")\n", + "\n", + "plt.fill_between(\n", + " train_sizes, val_mean-val_std, val_mean+val_std, color=\"orange\",alpha=0.3 # band spans validation mean +/- one std\n", + ")\n", + "\n", + "plt.xlabel(\"Training Size\", fontdict={\"fontsize\":14})\n", + "plt.ylabel(\"Accuracy\",fontdict={\"fontsize\":14})\n", + "plt.title(\"Learning Curve\", fontdict={\"fontsize\":14})\n", + "plt.ylim((0.4,1.1))\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3d434fdf", + "metadata": {}, + "source": [ + "Well, the model is **overfitting** because it is **too complex** for the given problem and data. Adding more training data (by extending the x-axis) is unlikely to help significantly, as the validation accuracy has already flattened out." + ] + }, + { + "cell_type": "markdown", + "id": "010ecb7b", + "metadata": {}, + "source": [ + "I have tried everything I could to make it robust. But, I think this data is just too complex for classification even with complex techniques and algorithms. 
" + ] + }, + { + "cell_type": "markdown", + "id": "a543cd94", + "metadata": {}, + "source": [ + "## Summary" + ] + }, + { + "cell_type": "markdown", + "id": "c41dc50b", + "metadata": {}, + "source": [ + "The biggest win of the entire fork is that I have initiated a 100 times more **standard** and **robust** way to find out the best possible performance. I have tested with Grid and Randomized searches for tuning the hyperparameters as much as possible, which is paired with ensemble techniques like **VotingClassifier** and **Stacking**. Furthermore, I tried using the multiple-time Kaggle-winning algorithm, the **XGBoost** implementation of **Gradient Boosting**. \n", + "\n", + "However, the data was still too complex for classification. If anyone has any idea, I would love to hear it!\n", + "\n", + "\n", + "Thanks for giving this notebook a look! I hope you like it." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "JupyterFix (Py3.13)", + "language": "python", + "name": "jupyterfix" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2fd9c9a702d63152110fdfba3b2ca22d50b074ca Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:30:05 +0600 Subject: [PATCH 05/13] Update flask app code --- app.py | 64 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/app.py b/app.py index 98e0199..ef717b3 100644 --- a/app.py +++ b/app.py @@ -1,34 +1,58 @@ -from flask import render_template,request,jsonify,Flask +from flask import Flask, render_template, request, jsonify import numpy as np import joblib +import pandas as pd -model = joblib.load("model.pkl") -scaler = joblib.load("scaler.pkl") +pipe = 
joblib.load("models/pipe.pkl") # sklearn Pipeline (scaler + model) +column_names = joblib.load("models/column_names.pkl") # list of feature names -reverse_mapping = {0:"FALSE POSITIVE",1:"CANDIDATE",2:"CONFIRMED"} +reverse_mapping = {0: "FALSE POSITIVE", 1: "CANDIDATE", 2: "CONFIRMED"} app = Flask(__name__) @app.route("/") def home(): - return render_template("index.html") - -@app.route("/predict",methods=["POST"]) -def predict(): - try: - data = request.json["features"] - arr = np.array(data).reshape(1,-1) - arr_scaled = scaler.transform(arr) - pred = model.predict(arr_scaled)[0] - proba_pred = model.predict_proba(arr_scaled)[0] - proba_dict = {reverse_mapping[i]: round(p,3) for i,p in enumerate(proba_pred)} - return jsonify({"prediction":reverse_mapping[pred],"probabilities":proba_dict}) - except Exception as e: - return jsonify({"error":e}) + return render_template("index.html") @app.route("/about") def about(): - return render_template("about.html") + return render_template("about.html") + +@app.route("/predict", methods=["POST"]) +def predict(): + try: + raw_features = [ + request.json["orbital-period"], + request.json["transit-epoch"], + request.json["transit-depth"], + request.json["planet-radius"], + request.json["semi-major-axis"], + request.json["inclination"], + request.json["equilibrium-temp"], + request.json["insolation-flux"], + request.json["impact-parameter"], + request.json["radius-ratio"], + request.json["stellar-density"], + request.json["star-distance"], + request.json["num-transits"], + ] + + df = pd.DataFrame([raw_features], columns=column_names) + + pred = int(pipe.predict(df)[0]) + proba = pipe.predict_proba(df)[0] + + proba_dict = { + reverse_mapping[i]: round(p, 3) for i, p in enumerate(proba) + } + + return jsonify( + {"prediction": reverse_mapping[pred], "probabilities": proba_dict} + ) + + except Exception as e: + return jsonify({"error": str(e)}), 400 + if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file + 
app.run(debug=True) \ No newline at end of file From 909008a9066509e08fd03f9e42e6c0d69f49711d Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:30:33 +0600 Subject: [PATCH 06/13] Update index.html to protect integration --- templates/index.html | 688 ++++++++++++++----------------------------- 1 file changed, 225 insertions(+), 463 deletions(-) diff --git a/templates/index.html b/templates/index.html index e88980b..68241ec 100644 --- a/templates/index.html +++ b/templates/index.html @@ -4,12 +4,15 @@ - Exoplanet Classifier™ + - +
-

+

Int. Space Apps Challenge Presented by Ontohin 4b @@ -35,11 +38,13 @@

Learn more -
+
+
+ src="{{ url_for('static', filename='materials/d821657540b6765c2d915b547bfce9c5 (1).jpg') }}" + alt="">

Welcome, To The

@@ -50,7 +55,8 @@

Exoplanet-Classifier

will classify it under 3 classes: Confirmed planet, Planetary Candidate or False Positive.

-
@@ -59,8 +65,9 @@

Exoplanet-Classifier

-
-
+
+

The
Exoplanet
Classifier @@ -69,9 +76,11 @@

-

Enter the - details

+

+ Enter the details

+
+
@@ -132,14 +141,13 @@

-
@@ -147,55 +155,45 @@

- -
+ + - - \ No newline at end of file + + Date: Fri, 7 Nov 2025 17:30:46 +0600 Subject: [PATCH 07/13] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fa878c8..a18ea4b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ joblib gunicorn xgboost imbalanced-learn +matplotlib \ No newline at end of file From e728a5e5fb4965e475b727858439463862965893 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:30:53 +0600 Subject: [PATCH 08/13] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 287246d..4501521 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,5 @@ scaler.pkl instance/ shit/ *.db -.vscode/ \ No newline at end of file +.vscode/ +*.pkl \ No newline at end of file From 1bead9b72e2a2a46e04bfd9142a5b912c94eea41 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:31:05 +0600 Subject: [PATCH 09/13] Add info.txt in models --- models/info.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 models/info.txt diff --git a/models/info.txt b/models/info.txt new file mode 100644 index 0000000..43ab66d --- /dev/null +++ b/models/info.txt @@ -0,0 +1,2 @@ +.pkl files go here. Run fit.py to train the model. + It will automatically create the .pkl files. 
\ No newline at end of file From aa130b8348a0c166636ca91204a1daac99dee33a Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:54:58 +0600 Subject: [PATCH 10/13] Update code structure in fit.py --- fit.py | 254 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 128 insertions(+), 126 deletions(-) diff --git a/fit.py b/fit.py index 57a105f..cc68ef9 100644 --- a/fit.py +++ b/fit.py @@ -1,70 +1,25 @@ +import time +import pandas as pd +import numpy as np +import joblib + from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.impute import SimpleImputer - from sklearn.ensemble import RandomForestClassifier, StackingClassifier from sklearn.linear_model import LogisticRegression from xgboost import XGBClassifier - -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA - from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline -import time -import pandas as pd -import numpy as np -import joblib -# Loading and processing 1st Dataset (NASA Kepler Objects of Interest) -df_raw = pd.read_csv("data/kepler_data.csv", comment="#") - -feature_list = ["koi_disposition","koi_period","koi_time0bk", - "koi_depth","koi_prad","koi_sma","koi_incl","koi_teq","koi_insol", - "koi_impact","koi_ror","koi_srho","koi_dor","koi_num_transits" -] - -df_selected = df_raw[feature_list] - -df_1 = df_selected.copy() - -# Loading and processing 2nd dataset (NASA K2 Objects of Interest) -df_2 = pd.read_csv("data/k2_data.csv",comment="#") - -## Feature Engineering missing column in K2 (koi_num_transits) -# Campaign dates dictionary (BJD) -campaign_dates = { - 0: (2456725.0, 2456805.0), - 1: (2456808.0, 2456891.0), - 2: (2456893.0, 2456975.0), - 3: (2456976.0, 2457064.0), - 4: (2457065.0, 2457159.0), - 5: (2457159.0, 2457246.0), - 6: (2457250.0, 2457338.0), - 7: (2457339.0, 2457420.0), - 8: (2457421.0, 2457530.0), - 9: (2457504.0, 2457579.0), - 10: (2457577.0, 
2457653.0), - 11: (2457657.0, 2457732.0), - 12: (2457731.0, 2457819.0), - 13: (2457820.0, 2457900.0), - 14: (2457898.0, 2457942.0), - 15: (2457941.0, 2458022.0), - 16: (2458020.0, 2458074.0), - 17: (2458074.0, 2458176.0), - 18: (2458151.0, 2458201.0), - 19: (2458232.0, 2458348.0) -} - -def get_window(camps): +def get_window(camps, campaign_dates): if pd.isna(camps) or not camps: return np.nan, np.nan - + camps = str(camps).split(',') if isinstance(camps, str) else camps - - # Filter valid campaign numbers and get start/end times - starts = [] - ends = [] + starts, ends = [], [] + for c in camps: try: camp_num = int(c.strip()) @@ -73,77 +28,124 @@ def get_window(camps): starts.append(start) ends.append(end) except (ValueError, KeyError): - continue - + continue + return (min(starts) if starts else np.nan, max(ends) if ends else np.nan) -df_2['campaigns'] = df_2['k2_campaigns'] -df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply(lambda x: pd.Series(get_window(x))) - -# For transit counting (as before) -df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) -df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0) -df_2 = df_2[["disposition","pl_orbper","pl_tranmid","pl_trandep","pl_rade","pl_orbsmax","pl_orbincl","pl_eqt","pl_insol","pl_imppar","pl_ratror","pl_dens","pl_ratdor","num_transits"]] - -# Concatenating df_1 and df_2 -mapping = {"disposition":"koi_disposition","pl_orbper":"koi_period","pl_tranmid":"koi_time0bk", - "pl_trandep":"koi_depth","pl_rade":"koi_prad","pl_orbsmax":"koi_sma", - "pl_orbincl":"koi_incl","pl_eqt":"koi_teq","pl_insol":"koi_insol","pl_imppar":"koi_impact", - "pl_ratror":"koi_ror","pl_dens":"koi_srho","pl_ratdor":"koi_dor","num_transits":"koi_num_transits" - } -df_2 = df_2.rename(columns=mapping) - -df = pd.concat([df_1,df_2]) - -# Input-output separation -X = df.iloc[:,1:] -column_name 
= X.columns -X = X.to_numpy() -y = df["koi_disposition"].map({"FALSE POSITIVE":0,"CANDIDATE":1,"CONFIRMED":2,"REFUTED":0}).to_numpy() - -# Train-test split -x_train, x_test, y_train, y_test = train_test_split( - X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y -) - -# Define Models -rf = RandomForestClassifier(n_estimators=1000,max_depth=None,random_state=542,class_weight="balanced") -xgb = XGBClassifier(n_estimators=1000,max_depth=None,learning_rate=0.5,random_state=9) - -# Define estimators list for stacking -estimators = [ - ("rf",rf), - ("xgb",xgb) -] - -# Define final estimator for stacking -final_estimator = LogisticRegression( - random_state=891,class_weight="balanced",C=0.1,penalty="l2",solver="saga",max_iter=5000 -) - -# Stacking -mv = StackingClassifier( - estimators=estimators,final_estimator=final_estimator, - cv=5,passthrough=True,n_jobs=-1 -) - -# Pipeline -pipe_mv = Pipeline([ - ("impute",SimpleImputer(strategy="mean")), - ("scale",StandardScaler()), - ("smote",SMOTE()), - ("model",mv) -]) - -print("Starting model training. 
It will take some time, sit tight......") -t1 = time.time() -pipe_mv.fit(x_train,y_train) -t2 = time.time() -print("Model trained successfully") -minutes,seconds = np.divmod(t2-t1,60) -print(f"Time Elapsed: {minutes} M {seconds:.2f} S") - -# Dumping the model in pickle files -joblib.dump(pipe_mv,"models/pipe.pkl") -joblib.dump(column_name,"models/column_names.pkl") \ No newline at end of file + +def load_and_prepare_data(): + # Load Kepler dataset + df_raw = pd.read_csv("data/kepler_data.csv", comment="#") + feature_list = [ + "koi_disposition", "koi_period", "koi_time0bk", "koi_depth", "koi_prad", + "koi_sma", "koi_incl", "koi_teq", "koi_insol", "koi_impact", + "koi_ror", "koi_srho", "koi_dor", "koi_num_transits" + ] + df_1 = df_raw[feature_list].copy() + + # Load K2 dataset + df_2 = pd.read_csv("data/k2_data.csv", comment="#") + + # Define campaign windows + campaign_dates = { + 0: (2456725.0, 2456805.0), 1: (2456808.0, 2456891.0), 2: (2456893.0, 2456975.0), + 3: (2456976.0, 2457064.0), 4: (2457065.0, 2457159.0), 5: (2457159.0, 2457246.0), + 6: (2457250.0, 2457338.0), 7: (2457339.0, 2457420.0), 8: (2457421.0, 2457530.0), + 9: (2457504.0, 2457579.0), 10: (2457577.0, 2457653.0), 11: (2457657.0, 2457732.0), + 12: (2457731.0, 2457819.0), 13: (2457820.0, 2457900.0), 14: (2457898.0, 2457942.0), + 15: (2457941.0, 2458022.0), 16: (2458020.0, 2458074.0), 17: (2458074.0, 2458176.0), + 18: (2458151.0, 2458201.0), 19: (2458232.0, 2458348.0) + } + + # Add observation window + df_2['campaigns'] = df_2['k2_campaigns'] + df_2[['obs_start_bjd', 'obs_end_bjd']] = df_2['campaigns'].apply( + lambda x: pd.Series(get_window(x, campaign_dates)) + ) + + # Transit counting + df_2['n_min'] = np.ceil((df_2['obs_start_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) + df_2['n_max'] = np.floor((df_2['obs_end_bjd'] - df_2['pl_tranmid']) / df_2['pl_orbper']) + df_2['num_transits'] = (df_2['n_max'] - df_2['n_min'] + 1).clip(lower=0) + + # Select and rename columns + df_2 = df_2[ + 
["disposition", "pl_orbper", "pl_tranmid", "pl_trandep", "pl_rade", + "pl_orbsmax", "pl_orbincl", "pl_eqt", "pl_insol", "pl_imppar", + "pl_ratror", "pl_dens", "pl_ratdor", "num_transits"] + ] + + mapping = { + "disposition": "koi_disposition", "pl_orbper": "koi_period", "pl_tranmid": "koi_time0bk", + "pl_trandep": "koi_depth", "pl_rade": "koi_prad", "pl_orbsmax": "koi_sma", + "pl_orbincl": "koi_incl", "pl_eqt": "koi_teq", "pl_insol": "koi_insol", + "pl_imppar": "koi_impact", "pl_ratror": "koi_ror", "pl_dens": "koi_srho", + "pl_ratdor": "koi_dor", "num_transits": "koi_num_transits" + } + df_2 = df_2.rename(columns=mapping) + + # Combine both datasets + df = pd.concat([df_1, df_2]) + + # Prepare input/output + X = df.iloc[:, 1:].to_numpy() + y = df["koi_disposition"].map({ + "FALSE POSITIVE": 0, "CANDIDATE": 1, "CONFIRMED": 2, "REFUTED": 0 + }).to_numpy() + + return X, y, df.columns[1:] + + +def build_pipeline(): + rf = RandomForestClassifier( + n_estimators=1000, max_depth=None, random_state=542, class_weight="balanced" + ) + xgb = XGBClassifier( + n_estimators=1000, max_depth=None, learning_rate=0.5, random_state=9 + ) + estimators = [("rf", rf), ("xgb", xgb)] + + final_estimator = LogisticRegression( + random_state=891, class_weight="balanced", C=0.1, + penalty="l2", solver="saga", max_iter=5000 + ) + + mv = StackingClassifier( + estimators=estimators, final_estimator=final_estimator, + cv=5, passthrough=True, n_jobs=-1 + ) + + pipe = Pipeline([ + ("impute", SimpleImputer(strategy="mean")), + ("scale", StandardScaler()), + ("smote", SMOTE()), + ("model", mv) + ]) + return pipe + + +def main(): + X, y, column_name = load_and_prepare_data() + + x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=1/3, shuffle=True, random_state=91, stratify=y + ) + + pipe_mv = build_pipeline() + + print("Starting model training. 
It will take some time, sit tight......") + t1 = time.time() + pipe_mv.fit(x_train, y_train) + t2 = time.time() + + print("Model trained successfully") + minutes, seconds = np.divmod(t2 - t1, 60) + print(f"Time Elapsed: {minutes:.0f} M {seconds:.2f} S") + + joblib.dump(pipe_mv, "models/pipe.pkl") + joblib.dump(column_name, "models/column_names.pkl") + print("Model and column names saved successfully.") + + +if __name__ == "__main__": + main() From de1c574f8668c020014a4338ba0437865491d2a9 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 17:55:21 +0600 Subject: [PATCH 11/13] Add self-heal functionality in app.py --- app.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/app.py b/app.py index ef717b3..2aa55b7 100644 --- a/app.py +++ b/app.py @@ -1,13 +1,65 @@ -from flask import Flask, render_template, request, jsonify -import numpy as np +import os import joblib import pandas as pd +import numpy as np +from flask import Flask, render_template, request, jsonify +from fit import main -pipe = joblib.load("models/pipe.pkl") # sklearn Pipeline (scaler + model) -column_names = joblib.load("models/column_names.pkl") # list of feature names - +# --- Configuration --- +MODEL_DIR = "models" +PIPE_PATH = os.path.join(MODEL_DIR, "pipe.pkl") +COLUMNS_PATH = os.path.join(MODEL_DIR, "column_names.pkl") reverse_mapping = {0: "FALSE POSITIVE", 1: "CANDIDATE", 2: "CONFIRMED"} +# --- Self-Heal Function --- +def initialize_artifacts(): + """ + Checks if model artifacts exist. If not, runs the training script. + """ + # 1. Ensure the model directory exists + os.makedirs(MODEL_DIR, exist_ok=True) + + # 2. 
Check for missing files + pipe_exists = os.path.exists(PIPE_PATH) + columns_exists = os.path.exists(COLUMNS_PATH) + + if not pipe_exists or not columns_exists: + print("--- MODEL ARTIFACTS MISSING ---") + if not pipe_exists: + print(f"Missing: {PIPE_PATH}") + if not columns_exists: + print(f"Missing: {COLUMNS_PATH}") + + print("Running training routine (fit.main())... This may take a moment.") + try: + # Run the main training function from fit.py + main() + print("Training complete. Artifacts generated successfully.") + print("---------------------------------") + except Exception as e: + print(f"\nFATAL: Error during self-heal training: {e}") + print("Application cannot start without model artifacts.") + print("Please fix the training script (fit.py) and restart.") + exit(1) # Exit if training fails + else: + print("Model artifacts found. Loading...") + +# --- Application Startup --- + +# Run the self-heal check *before* loading models +initialize_artifacts() + +# Load models +try: + pipe = joblib.load(PIPE_PATH) + column_names = joblib.load(COLUMNS_PATH) + print("Models loaded successfully.") +except Exception as e: + print(f"\nFATAL: Error loading model artifacts: {e}") + print("Files might be corrupt. 
Try deleting the 'models' directory and restarting.") + exit(1) # Exit if loading fails + +# Initialize Flask App app = Flask(__name__) @app.route("/") @@ -21,6 +73,7 @@ def about(): @app.route("/predict", methods=["POST"]) def predict(): try: + # Extract features from the JSON request raw_features = [ request.json["orbital-period"], request.json["transit-epoch"], @@ -37,20 +90,28 @@ def predict(): request.json["num-transits"], ] + # Create DataFrame with correct column names df = pd.DataFrame([raw_features], columns=column_names) + # Get prediction and probabilities pred = int(pipe.predict(df)[0]) proba = pipe.predict_proba(df)[0] + # Format probabilities for the response proba_dict = { reverse_mapping[i]: round(p, 3) for i, p in enumerate(proba) } + # Send response return jsonify( {"prediction": reverse_mapping[pred], "probabilities": proba_dict} ) + except KeyError as e: + print(f"Prediction Error: Missing key in request {e}") + return jsonify({"error": f"Missing feature in request: {e}"}), 400 except Exception as e: + print(f"Prediction Error: {e}") return jsonify({"error": str(e)}), 400 From 28991071833d8b27713c2faaff3382078cd12d64 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 18:11:57 +0600 Subject: [PATCH 12/13] Update fit.py to show classification report --- fit.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fit.py b/fit.py index cc68ef9..61c0538 100644 --- a/fit.py +++ b/fit.py @@ -11,6 +11,7 @@ from xgboost import XGBClassifier from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline +from sklearn.metrics import classification_report def get_window(camps, campaign_dates): @@ -123,6 +124,10 @@ def build_pipeline(): ]) return pipe +def eval(y_test,x_test,estimator): + y_true = y_test + y_pred = estimator.predict(x_test) + return classification_report(y_true,y_pred) def main(): X, y, column_name = load_and_prepare_data() @@ -142,6 +147,9 @@ def main(): minutes, seconds = np.divmod(t2 - t1, 60) 
print(f"Time Elapsed: {minutes:.0f} M {seconds:.2f} S") + + print(eval(y_test,x_test,pipe_mv)) + joblib.dump(pipe_mv, "models/pipe.pkl") joblib.dump(column_name, "models/column_names.pkl") print("Model and column names saved successfully.") From 54349cd9fc2b5a804c215643c491916a78f1dd89 Mon Sep 17 00:00:00 2001 From: Sakib Hossain Date: Fri, 7 Nov 2025 18:12:08 +0600 Subject: [PATCH 13/13] Update about.html --- templates/about.html | 186 +------------------------------------------ 1 file changed, 3 insertions(+), 183 deletions(-) diff --git a/templates/about.html b/templates/about.html index b7b2588..3d879bb 100644 --- a/templates/about.html +++ b/templates/about.html @@ -1,177 +1,3 @@ - - @@ -251,14 +77,8 @@

How does it work?

- We have used the widely known Scikit-learn library and its - RandomForestClassifier to build it from the ground up. - It is an ensemble model which works by creating a bunch of decision trees - and training them with slightly different features. - If you want to learn more about it, click - here - to see the official docs. + We have used the widely known Scikit-learn library to build it from the ground up. + More info about the ML part can be found in the README file of the repository.

@@ -378,7 +198,7 @@

Things to Keep in Mind

Have a great day!

- + Click here to visit the GitHub repository.