diff --git a/description.md b/description.md index b82a78d..b893a59 100644 --- a/description.md +++ b/description.md @@ -1,3 +1,6 @@ # Description of submission +The submission uses an ensemble of Gradient Boosting Machines (LightGBM, XGBoost and Sklearn's Histogram-Based-Boosting) to predict the fertility. The classifiers are only trained on the individuals with an available outcome variable. Variables are selected based on the Feature Importance of simple LightGBM and XGBoost models trained repeatedly on a subset of the data. The household variable from the background dataset is used to conduct grouped Train-Test-Splits or Cross-Validation to avoid data leakage per household. Moreover I tried to preprocess all the features based on the definitions in the codebook and certain heuristics, i.e. all personality variables are defined as continuous variables, missing value indicators are removed, years are reformatted to ages (or time-differences), categorical variables are defined as the respective pandas dtypes. During preprocessing I also removed the free-form-answers. +The feature set used is very large (over 1000 variables), because I have seen minor improvements in prediction quality, but I lacked the time to identify the relevant variables. My goal was to take a data-driven approach to feature selection to identify currently unknown correlates of fertility, but judging by the selected variables, this was not successful. I cannot rule out overfitting, so the large number of variables most likely degrades the performance on the holdout data. +The training and hyperparameter optimization were done with Microsoft's FLAML library. This library offers so-called flamlized versions of common Machine Learning Classifiers (e.g. LightGBM), which enable zero-shot Hyperparameter Tuning. These Hyperparameters are selected based on characteristics of the dataset, so no expensive Optimization is needed while iterating on the ideal model. 
Moreover the library offers an easy-to-use way to optimize Hyperparameters utilizing the information described above. -Test submission \ No newline at end of file +I tried using Semi-Supervised Learning with SelfTraining or TriTraining (with Disagreement) to utilize the large amounts of missing data, but was not able to reach a better F1-score. Moreover I tried time-shifting the data, which works better but it contradicts my goal of a fully data-driven approach. \ No newline at end of file diff --git a/training.py b/training.py index dd24422..44a4c4f 100644 --- a/training.py +++ b/training.py @@ -1,18 +1,50 @@ """ This is an example script to train your model given the (cleaned) input dataset. -This script will not be run on the holdout data, +This script will not be run on the holdout data, but the resulting model model.joblib will be applied to the holdout data. -It is important to document your training steps here, including seed, +It is important to document your training steps here, including seed, number of folds, model, et cetera """ import pandas as pd -from sklearn.linear_model import LogisticRegression import joblib +import random +from pathlib import Path +from sklearn.model_selection import StratifiedGroupKFold +from flaml import AutoML, tune +import joblib +from sklearn.base import clone + + +def stratified_group_train_test_split( + X, y, group_col: str, n_splits: int, random_state: int | None = None, drop_group_column=True +) -> tuple: + """ + Wrapper around StratifiedGroupKFold for a single Train-Test-Split with grouping on the household id: the first of the n_splits shuffled folds becomes the test set, and (X_train, X_test, y_train, y_test) are returned indexed by "nomem_encr", with y_train and y_test taken as the first column of y. 
+ """ + groups = X[group_col].to_list() + + if drop_group_column: + X = X.drop(columns=[group_col]) + + sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=random_state) + + # Get a grouped stratified Train Test Split + train_index, test_index = next(sgkf.split(X, y, groups)) + + X_train, X_test, y_train, y_test = ( + X.reset_index().loc[train_index].set_index("nomem_encr"), + X.reset_index().loc[test_index].set_index("nomem_encr"), + y.reset_index().loc[train_index].set_index("nomem_encr"), + y.reset_index().loc[test_index].set_index("nomem_encr"), + ) -def train_save_model(cleaned_df, outcome_df): + return X_train, X_test, y_train.iloc[:, 0], y_test.iloc[:, 0] + + +def train_save_model(X, y): """ Trains a model using the cleaned dataframe and saves the model to a file. @@ -21,22 +53,75 @@ def train_save_model(cleaned_df, outcome_df): outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv). """ + # This script contains a bare minimum working example + random.seed(123) + + background = pd.read_csv(Path.cwd().parent / "data" / "other_data" / "PreFer_train_background_data.csv") + + # Sort by wave and drop duplicates afterwards. So newest wave for each individual is on top. 
+ nohouse_nomem_map = ( + background.sort_values("wave", ascending=False).drop_duplicates(subset=["nomem_encr"]).set_index("nomem_encr") + ) + + X = X.join(nohouse_nomem_map["nohouse_encr"]) + + X_with_outcome = X[y.notnull()] + y_with_outcome = y[y.notnull()] + + X_train, X_test, y_train, y_test = stratified_group_train_test_split( + X_with_outcome, y_with_outcome, group_col="nohouse_encr", n_splits=4, random_state=123, drop_group_column=False + ) + + # Save groups to variables and drop + train_groups = X_train["nohouse_encr"] + test_groups = X_test["nohouse_encr"] + X_train = X_train.drop(columns=["nohouse_encr"]) + X_test = X_test.drop(columns=["nohouse_encr"]) + + # Print X Train and Test Shape + print(f"Train Size: {len(X_train)}") + print(f"Test Size: {len(X_test)}") - ## This script contains a bare minimum working example - #random.seed(1) # not useful here because logistic regression deterministic - - # Combine cleaned_df and outcome_df - model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr") + custom_hp = { + "lgbm": { + "boosting_type": {"domain": "dart"}, + "data_sample_strategy": {"domain": "goss"}, + "class_weight": {"domain": tune.choice(["balanced", None])}, + }, + "xgboost": { + "scale_pos_weight": {"domain": tune.randint(1, 30)}, + }, + "histgb": { + "max_features": {"domain": tune.uniform(0.5, 1)}, + "class_weight": {"domain": tune.choice(["balanced", None])}, + }, + } - # Filter cases for whom the outcome is not available - model_df = model_df[~model_df['new_child'].isna()] - - # Logistic regression model - model = LogisticRegression() + automl = AutoML() - # Fit the model - model.fit(model_df[['age', 'gender_bg']], model_df['new_child']) # <------- ADDED VARIABLE + automl.fit( + X_train, + y_train, + task="classification", + metric="f1", + time_budget=60 * 300, + starting_points="data", + # max_iter=8, + eval_method="cv", + split_type="group", + groups=train_groups, + n_splits=10, + skip_transform=False, + estimator_list=["xgboost", 
"histgb", "lgbm"], + log_file_name="flaml.log", + ensemble=True, + early_stop=True, + custom_hp=custom_hp, + seed=123, + ) - # Save the model - joblib.dump(model, "model.joblib") + # Retrain model and save + final_model = clone(automl.model) + final_model.fit(X_with_outcome.drop(columns=["nohouse_encr"]), y_with_outcome) + joblib.dump(automl, "flaml_model.joblib")  # NOTE(review): this persists the AutoML object; the refitted final_model is never saved - confirm which object the prediction code expects