From 4788ba9d1e0e4a6096017b6404aac2f7863402d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jill-J=C3=AAnn=20Vie?= Date: Fri, 5 Apr 2024 17:50:33 +0200 Subject: [PATCH] Fix folds in metrics --- eval_metrics.py | 25 +- lr.py | 1 + notebooks/Prepare Assistments 2009 full.ipynb | 491 ++++++++++++++++++ sktm.py | 24 +- 4 files changed, 525 insertions(+), 16 deletions(-) create mode 100644 notebooks/Prepare Assistments 2009 full.ipynb diff --git a/eval_metrics.py b/eval_metrics.py index 37a6fbf..087e23e 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -51,7 +51,7 @@ def all_metrics(results, test): print('This is not the right fold', len(y), len(test)) sys.exit(0) - for user, pred, true in zip(test['user_id'], y_pred, y): + for user, pred, true in zip(test['user'], y_pred, y): predictions_per_user[user]['pred'].append(pred) predictions_per_user[user]['y'].append(true) @@ -135,6 +135,7 @@ def all_metrics(results, test): val = 0 for subgroup, auc, nb in zip(attr_ids, metrics_per_sensitive_attr['auc'], nb_samples): candidates[subgroup] = (-auc, -nb) + print(len(candidates), 'groups and ', test[SENSITIVE_ATTR].nunique(), 'schools in test') x = [] nb = [] @@ -145,6 +146,9 @@ def all_metrics(results, test): nb.append(-yi) val += 1 plt.stem(x, nb, use_line_collection=True) + plt.xlabel('AUC value') + plt.ylabel('Number of samples in group') + plt.title('For each group, number of samples per AUC value') plt.show() # Display ids of the subgroups (sensitive attribute) that have the lowest/highest AUC @@ -173,29 +177,20 @@ def all_metrics(results, test): plt.show() if __name__ == '__main__': - os.chdir('data/assistments09') + os.chdir('data/assistments2009full') # os.chdir('data/fr_en') - # indices = np.load('folds/weak278607fold0.npy') - # indices = np.load('folds/278607fold0.npy') - # indices = np.load('folds/50weak278607fold0.npy') - # indices = np.load('folds/weak926646fold0.npy') - # indices = np.load('folds/1199731fold0.npy') - # indices = np.load('folds/50weak341791fold0.npy') - indices = np.load('folds/50weak341791fold0.npy') - print(len(indices)) - - df = pd.read_csv('needed.csv') - test = df.iloc[indices] + df = pd.read_csv('data.csv') # r = re.compile(r'results-(.*).json') # ndcg_ = defaultdict(list) - for filename in sorted(glob.glob('results*2020*'))[::-1][:1]: + for filename in sorted(glob.glob('results*2024*'))[::-1][:1]: print(filename) with open(filename) as f: results = json.load(f) - all_metrics(results, test) + i_test = results['predictions'][0]['i_test'] + all_metrics(results, df.iloc[i_test]) diff --git a/lr.py b/lr.py index 7e476c3..db3d37e 100644 --- a/lr.py +++ b/lr.py @@ -71,6 +71,7 @@ if dataset == 'Test': predictions.append({ 'fold': i, + 'i_test': i_test.tolist(), 'pred': y_pred.tolist(), 'y': y.tolist() }) diff --git a/notebooks/Prepare Assistments 2009 full.ipynb b/notebooks/Prepare Assistments 2009 full.ipynb new file mode 100644 index 0000000..a87483a --- /dev/null +++ b/notebooks/Prepare Assistments 2009 full.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "689473a8", + "metadata": {}, + "source": [ + "fulldata.csv can be downloaded from \"ASSISTments 2009-2010 Full Data set\" \n", + "at https://sites.google.com/site/assistmentsdata/home/2009-2010-assistment-data?authuser=0\n", + "\n", + "We should check for duplicate lines." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d273b8fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idassignment_iduser_idassistment_idproblem_idoriginalcorrectattempt_countms_first_response_timetutor_modeanswer_typesequence_idstudent_class_idpositionproblem_set_typebase_sequence_idlist_skill_idslist_skillsteacher_idschool_id
02022408523236873963429047642900.03106016.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
12022409523236873963429047643001.01194187.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
22022411323236873963429047643101.0112734.0tutoralgebra6272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
32022412323236873963429047643201.01333484.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
42022414223236873963429047643300.0252828.0tutoralgebra6272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
\n", + "
" + ], + "text/plain": [ + " order_id assignment_id user_id assistment_id problem_id original \\\n", + "0 20224085 232368 73963 42904 76429 0 \n", + "1 20224095 232368 73963 42904 76430 0 \n", + "2 20224113 232368 73963 42904 76431 0 \n", + "3 20224123 232368 73963 42904 76432 0 \n", + "4 20224142 232368 73963 42904 76433 0 \n", + "\n", + " correct attempt_count ms_first_response_time tutor_mode answer_type \\\n", + "0 0.0 3 106016.0 tutor choose_1 \n", + "1 1.0 1 194187.0 tutor choose_1 \n", + "2 1.0 1 12734.0 tutor algebra \n", + "3 1.0 1 333484.0 tutor choose_1 \n", + "4 0.0 2 52828.0 tutor algebra \n", + "\n", + " sequence_id student_class_id position problem_set_type base_sequence_id \\\n", + "0 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "1 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "2 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "3 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "4 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "\n", + " list_skill_ids list_skills teacher_id school_id \n", + "0 NaN NaN 22763.0 73.0 \n", + "1 NaN NaN 22763.0 73.0 \n", + "2 NaN NaN 22763.0 73.0 \n", + "3 NaN NaN 22763.0 73.0 \n", + "4 NaN NaN 22763.0 73.0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('data/assistments2009full/fulldata.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c6228adb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "108" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['school_id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4b338b5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list_skills\n", + "Addition and Subtraction Integers 24161\n", + "Conversion of Fraction Decimals Percents 23179\n", + "Proportion 22099\n", + "Addition and Subtraction Fractions 16232\n", + "Multiplication and Division Integers 13363\n", + " ... \n", + "Addition and Subtraction Integers;Table 1\n", + "Addition and Subtraction Positive Decimals;Multiplication and Division Positive Decimals;Perimeter of a Polygon;Pythagorean Theorem;Rounding;Square Root 1\n", + "Estimation;Multiplication and Division Positive Decimals;Rounding 1\n", + "Addition and Subtraction Positive Decimals;Table 1\n", + "Ordering Fractions;Table 1\n", + "Name: count, Length: 337, dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['list_skills'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "014920c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "skill\n", + "0 577989\n", + "21 24161\n", + "82 23179\n", + "281 22099\n", + "15 16232\n", + " ... \n", + "25 1\n", + "30 1\n", + "131 1\n", + "31 1\n", + "228 1\n", + "Name: count, Length: 338, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "df['skill'] = np.unique(df['list_skills'].fillna(''), return_inverse=True)[1]\n", + "df['skill'].value_counts()\n", + "# This was a fast way; should do it properly with a q-matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a0414cee", + "metadata": {}, + "outputs": [], + "source": [ + "MAPPING = {\n", + " 'user_id': 'user',\n", + " 'problem_id': 'item',\n", + " 'skill': 'skill',\n", + " 'correct': 'correct',\n", + " 'school_id': 'school_id'\n", + "}\n", + "subset = df[MAPPING.keys()].rename(columns=MAPPING).copy()\n", + "\n", + "subset['wins'] = 0\n", + "subset['fails'] = 0\n", + "subset['correct'] = subset['correct'].astype(int)\n", + "subset['school_id'] = subset['school_id'].fillna(0).astype(int)\n", + "subset.to_csv('data/assistments2009full/data.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a4e8c36e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useritemskillcorrectschool_idwinsfails
07396376429007300
17396376430017300
27396376431017300
37396376432017300
47396376433007300
\n", + "
" + ], + "text/plain": [ + " user item skill correct school_id wins fails\n", + "0 73963 76429 0 0 73 0 0\n", + "1 73963 76430 0 1 73 0 0\n", + "2 73963 76431 0 1 73 0 0\n", + "3 73963 76432 0 1 73 0 0\n", + "4 73963 76433 0 0 73 0 0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e2f291c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1011079" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(subset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sktm.py b/sktm.py index 8255ec1..08e79f1 100644 --- a/sktm.py +++ b/sktm.py @@ -1,5 +1,7 @@ """ Efficient implementation of knowledge tracing machines using scikit-learn. + +Currently: will not work on wins and fails > 1. """ import argparse from sklearn.pipeline import Pipeline @@ -19,7 +21,7 @@ df = pd.read_csv(options.csv_file) pipe = Pipeline([ ('onehot', OneHotEncoder(handle_unknown='ignore')), - ('lr', LogisticRegression(solver='liblinear')) + ('lr', LogisticRegression(solver='liblinear', C=1e-1, max_iter=300)) ]) @@ -54,3 +56,23 @@ pipe.fit(df_train[['skill', 'wins', 'fails']], df_train['correct']) print(pipe.predict_proba(df_test[['skill', 'wins', 'fails']])[:, 1]) break + +print('Full training PFA') +pipe.fit(df[['skill', 'wins', 'fails']], df['correct']) +print(pipe['lr'].coef_) + +print('Full training UISWF') +pipe.fit(df[['user', 'item', 'skill', 'wins', 'fails']], df['correct']) + +# Test for dummy dataset +coef = pipe['lr'].coef_[0] +print(coef.shape) +print(coef) +print(df.nunique()) +nb = [5, 2, 2, 1, 2] +print(sum(nb)) +print('Users', coef[:5]) +print('Items', coef[5:7]) +print('Skills', coef[7:9]) +print('Wins', coef[9:10]) +print('Fails', coef[10:12])