Skip to content

Commit

Permalink
test submission
Browse files Browse the repository at this point in the history
  • Loading branch information
yann1cks committed Sep 15, 2024
1 parent d11a946 commit 415513f
Showing 5 changed files with 28 additions and 18 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
.ipynb_checkpoints
.pyc
__pycache__/

# Data
data/
data/

# venv
.venv/
1 change: 1 addition & 0 deletions description.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Description of submission

Test submission
Binary file modified model.joblib
Binary file not shown.
26 changes: 12 additions & 14 deletions submission.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,22 @@
"""
This is an example script to generate the outcome variable given the input dataset.
This script should be modified to prepare your own submission that predicts
This script should be modified to prepare your own submission that predicts
the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function.
The predict_outcomes function takes a Pandas data frame. The return value must
be a data frame with two columns: nomem_encr and prediction. The nomem_encr column
should contain the nomem_encr column from the input data frame. The prediction
column should contain the predicted outcome for each nomem_encr. The prediction
should be 0 (no child) or 1 (having a child).
clean_df should be used to clean (preprocess) the data.
run.py can be used to test your submission.
"""

# List your libraries and modules here. Don't forget to update environment.yml!
import pandas as pd
from sklearn.linear_model import LogisticRegression
import joblib
#from prefer.data import *


def clean_df(df, background_df=None):
@@ -44,8 +42,9 @@ def clean_df(df, background_df=None):
# Selecting variables for modelling
keepcols = [
"nomem_encr", # ID variable required for predictions,
"age" # newly created variable
]
"age", # newly created variable
"gender_bg", # <--------ADDED VARIABLE
]

# Keeping data with variables selected
df = df[keepcols]
@@ -54,8 +53,8 @@ def clean_df(df, background_df=None):


def predict_outcomes(df, background_df=None, model_path="model.joblib"):
"""Generate predictions using the saved model and the input dataframe.
"""
Generate predictions using the saved model and the input dataframe.
The predict_outcomes function accepts a Pandas DataFrame as an argument
and returns a new DataFrame with two columns: nomem_encr and
prediction. The nomem_encr column in the new DataFrame replicates the
@@ -75,8 +74,9 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
"""

## This script contains a bare minimum working example
if "nomem_encr" not in df.columns:
print("The identifier variable 'nomem_encr' should be in the dataset")

#if "nomem_encr" not in df.columns:
# print("The identifier variable 'nomem_encr' should be in the dataset")

# Load the model
model = joblib.load(model_path)
@@ -85,15 +85,13 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
df = clean_df(df, background_df)

# Exclude the variable nomem_encr if this variable is NOT in your model
vars_without_id = df.columns[df.columns != 'nomem_encr']
vars_without_id = df.columns[df.columns != "nomem_encr"]

# Generate predictions from model, should be 0 (no child) or 1 (had child)
predictions = model.predict(df[vars_without_id])

# Output should be a DataFrame with two columns: nomem_encr and prediction
df_predict = pd.DataFrame(
{"nomem_encr": df["nomem_encr"], "prediction": predictions}
)
df_predict = pd.DataFrame({"nomem_encr": df["nomem_encr"], "prediction": predictions})

# Return only dataset with predictions and identifier
return df_predict
12 changes: 9 additions & 3 deletions training.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,10 @@
number of folds, model, et cetera
"""

import pandas as pd
from sklearn.linear_model import LogisticRegression
import joblib

def train_save_model(cleaned_df, outcome_df):
"""
Trains a model using the cleaned dataframe and saves the model to a file.
@@ -16,9 +20,10 @@ def train_save_model(cleaned_df, outcome_df):
cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model.
outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
"""



## This script contains a bare minimum working example
random.seed(1) # not useful here because logistic regression is deterministic
#random.seed(1) # not useful here because logistic regression is deterministic

# Combine cleaned_df and outcome_df
model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")
@@ -30,7 +35,8 @@ def train_save_model(cleaned_df, outcome_df):
model = LogisticRegression()

# Fit the model
model.fit(model_df[['age']], model_df['new_child'])
model.fit(model_df[['age', 'gender_bg']], model_df['new_child']) # <------- ADDED VARIABLE

# Save the model
joblib.dump(model, "model.joblib")

0 comments on commit 415513f

Please sign in to comment.