test submission

yann1cks · Sep 15, 2024 · 415513f · 415513f
1 parent d11a946
commit 415513f
Showing 5 changed files with 28 additions and 18 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,9 @@
 .ipynb_checkpoints
+.pyc
+__pycache__/
 
 # Data
-data/
+data/
+
+# venv
+.venv/
diff --git a/description.md b/description.md
@@ -1,2 +1,3 @@
 # Description of submission
 
+Test submission
diff --git a/model.joblib b/model.joblib
diff --git a/submission.py b/submission.py
@@ -1,24 +1,22 @@
 """
 This is an example script to generate the outcome variable given the input dataset.
-
-This script should be modified to prepare your own submission that predicts 
+This script should be modified to prepare your own submission that predicts
 the outcome for the benchmark challenge by changing the clean_df and predict_outcomes functions.
 
 The predict_outcomes function takes a Pandas data frame. The return value must
 be a data frame with two columns: nomem_encr and outcome. The nomem_encr column
 should contain the nomem_encr column from the input data frame. The outcome
 column should contain the predicted outcome for each nomem_encr. The outcome
 should be 0 (no child) or 1 (having a child).
-
 clean_df should be used to clean (preprocess) the data.
 
 run.py can be used to test your submission.
 """
 
 # List your libraries and modules here. Don't forget to update environment.yml!
 import pandas as pd
-from sklearn.linear_model import LogisticRegression
 import joblib
+#from prefer.data import *
 
 
 def clean_df(df, background_df=None):
@@ -44,8 +42,9 @@ def clean_df(df, background_df=None):
     # Selecting variables for modelling
     keepcols = [
         "nomem_encr",  # ID variable required for predictions,
-        "age"          # newly created variable
-    ] 
+        "age",  # newly created variable
+        "gender_bg",  # <--------ADDED VARIABLE
+    ]
 
     # Keeping data with variables selected
     df = df[keepcols]
@@ -54,8 +53,8 @@ def clean_df(df, background_df=None):
 
 
 def predict_outcomes(df, background_df=None, model_path="model.joblib"):
-    """Generate predictions using the saved model and the input dataframe.
-
+    """
+    Generate predictions using the saved model and the input dataframe.
     The predict_outcomes function accepts a Pandas DataFrame as an argument
     and returns a new DataFrame with two columns: nomem_encr and
     prediction. The nomem_encr column in the new DataFrame replicates the
@@ -75,8 +74,9 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
     """
 
     ## This script contains a bare minimum working example
-    if "nomem_encr" not in df.columns:
-        print("The identifier variable 'nomem_encr' should be in the dataset")
+
+    #if "nomem_encr" not in df.columns:
+    #    print("The identifier variable 'nomem_encr' should be in the dataset")
 
     # Load the model
     model = joblib.load(model_path)
@@ -85,15 +85,13 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"):
     df = clean_df(df, background_df)
 
     # Exclude the variable nomem_encr if this variable is NOT in your model
-    vars_without_id = df.columns[df.columns != 'nomem_encr']
+    vars_without_id = df.columns[df.columns != "nomem_encr"]
 
     # Generate predictions from model, should be 0 (no child) or 1 (had child)
     predictions = model.predict(df[vars_without_id])
 
     # Output should be a DataFrame with two columns, nomem_encr and prediction
-    df_predict = pd.DataFrame(
-        {"nomem_encr": df["nomem_encr"], "prediction": predictions}
-    )
+    df_predict = pd.DataFrame({"nomem_encr": df["nomem_encr"], "prediction": predictions})
 
     # Return only dataset with predictions and identifier
     return df_predict
diff --git a/training.py b/training.py
@@ -8,6 +8,10 @@
 number of folds, model, et cetera
 """
 
+import pandas as pd
+from sklearn.linear_model  import LogisticRegression
+import joblib
+
 def train_save_model(cleaned_df, outcome_df):
     """
     Trains a model using the cleaned dataframe and saves the model to a file.
@@ -16,9 +20,10 @@ def train_save_model(cleaned_df, outcome_df):
     cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model.
     outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
     """
-
+
+
     ## This script contains a bare minimum working example
-    random.seed(1) # not useful here because logistic regression deterministic
+    #random.seed(1) # not useful here because logistic regression is deterministic
 
     # Combine cleaned_df and outcome_df
     model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")
@@ -30,7 +35,8 @@ def train_save_model(cleaned_df, outcome_df):
     model = LogisticRegression()
 
     # Fit the model
-    model.fit(model_df[['age']], model_df['new_child'])
+    model.fit(model_df[['age', 'gender_bg']], model_df['new_child']) # <------- ADDED VARIABLE
 
     # Save the model
     joblib.dump(model, "model.joblib")
+