diff --git a/training.R b/training.R
index 08283f1..0cb6a28 100644
--- a/training.R
+++ b/training.R
@@ -1,11 +1,34 @@
 train_save_model <- function(cleaned_df, outcome_df) {
-  set.seed(1) # not useful here because logistic regression deterministic
+  # Combine cleaned_df and outcome_df to match on ID
   model_df <- merge(cleaned_df, outcome_df, by = "nomem_encr")
-  model <- glm(new_child ~ age + mean_income_imp, family = "binomial", data = model_df)
+  # glmnet requires a matrix; merge() returns a data.frame
+  model_df <- as.matrix(model_df)
+
+  # Features without the outcome and the identifier
+  X <- model_df[, !(colnames(model_df) %in% c("nomem_encr", "new_child"))]
+  # Outcome only
+  y <- model_df[, colnames(model_df) == "new_child"]
+
+  # LASSO regression
+  # Cross-validation to retrieve the optimal lambda
+  # (hyperparameter tuning)
+  set.seed(1)
+  CV <- cv.glmnet(x = X,
+                  y = y,
+                  family = "binomial",
+                  nfolds = 10, standardize = FALSE)
+  optimal_lambda_test <- CV$lambda.min
+
+  # Refit the model with the optimal lambda
+  model <- glmnet(x = X,
+                  y = y,
+                  family = "binomial",
+                  lambda = optimal_lambda_test, standardize = FALSE)
 
 
 
 
   # Save the model
   saveRDS(model, "model.rds")
+
 }
\ No newline at end of file
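
Because the patch replaces the glm() fit with a glmnet fit at the cross-validated lambda, any downstream prediction code must pass a numeric feature matrix via newx rather than a data frame. The sketch below shows how the saved model could be loaded and applied; it is not part of the patch. It assumes the glmnet package is attached, that new_cleaned_df is a hypothetical data frame with the same feature columns as the training data, and that all merged columns are numeric (as.matrix() on a data frame with character columns would produce a character matrix that glmnet cannot use).

# Minimal usage sketch, assuming a hypothetical new_cleaned_df with the
# same feature columns as the training data and only numeric columns.
library(glmnet)

model <- readRDS("model.rds")

# Drop the identifier and outcome columns, mirroring the training step
new_X <- as.matrix(new_cleaned_df[, !(colnames(new_cleaned_df) %in% c("nomem_encr", "new_child"))])

# type = "response" returns predicted probabilities for a binomial fit;
# the model was fit at a single lambda, so no s = argument is needed
predictions <- predict(model, newx = new_X, type = "response")

Saving only the single-lambda refit keeps model.rds small, at the cost of discarding the full cv.glmnet object; if the cross-validation curve is needed later, the CV object would have to be saved separately.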