From 4788ba9d1e0e4a6096017b6404aac2f7863402d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jill-J=C3=AAnn=20Vie?= Date: Fri, 5 Apr 2024 17:50:33 +0200 Subject: [PATCH] Fix folds in metrics --- eval_metrics.py | 25 +- lr.py | 1 + notebooks/Prepare Assistments 2009 full.ipynb | 491 ++++++++++++++++++ sktm.py | 24 +- 4 files changed, 525 insertions(+), 16 deletions(-) create mode 100644 notebooks/Prepare Assistments 2009 full.ipynb diff --git a/eval_metrics.py b/eval_metrics.py index 37a6fbf..087e23e 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -51,7 +51,7 @@ def all_metrics(results, test): print('This is not the right fold', len(y), len(test)) sys.exit(0) - for user, pred, true in zip(test['user_id'], y_pred, y): + for user, pred, true in zip(test['user'], y_pred, y): predictions_per_user[user]['pred'].append(pred) predictions_per_user[user]['y'].append(true) @@ -135,6 +135,7 @@ def all_metrics(results, test): val = 0 for subgroup, auc, nb in zip(attr_ids, metrics_per_sensitive_attr['auc'], nb_samples): candidates[subgroup] = (-auc, -nb) + print(len(candidates), 'groups and ', test[SENSITIVE_ATTR].nunique(), 'schools in test') x = [] nb = [] @@ -145,6 +146,9 @@ def all_metrics(results, test): nb.append(-yi) val += 1 plt.stem(x, nb, use_line_collection=True) + plt.xlabel('AUC value') + plt.ylabel('Number of samples in group') + plt.title('For each group, number of samples per AUC value') plt.show() # Display ids of the subgroups (sensitive attribute) that have the lowest/highest AUC @@ -173,29 +177,20 @@ def all_metrics(results, test): plt.show() if __name__ == '__main__': - os.chdir('data/assistments09') + os.chdir('data/assistments2009full') # os.chdir('data/fr_en') - # indices = np.load('folds/weak278607fold0.npy') - # indices = np.load('folds/278607fold0.npy') - # indices = np.load('folds/50weak278607fold0.npy') - # indices = np.load('folds/weak926646fold0.npy') - # indices = np.load('folds/1199731fold0.npy') - # indices = np.load('folds/50weak341791fold0.npy') - indices = np.load('folds/50weak341791fold0.npy') - print(len(indices)) - - df = pd.read_csv('needed.csv') - test = df.iloc[indices] + df = pd.read_csv('data.csv') # r = re.compile(r'results-(.*).json') # ndcg_ = defaultdict(list) - for filename in sorted(glob.glob('results*2020*'))[::-1][:1]: + for filename in sorted(glob.glob('results*2024*'))[::-1][:1]: print(filename) with open(filename) as f: results = json.load(f) - all_metrics(results, test) + i_test = results['predictions'][0]['i_test'] + all_metrics(results, df.iloc[i_test]) diff --git a/lr.py b/lr.py index 7e476c3..db3d37e 100644 --- a/lr.py +++ b/lr.py @@ -71,6 +71,7 @@ if dataset == 'Test': predictions.append({ 'fold': i, + 'i_test': i_test.tolist(), 'pred': y_pred.tolist(), 'y': y.tolist() }) diff --git a/notebooks/Prepare Assistments 2009 full.ipynb b/notebooks/Prepare Assistments 2009 full.ipynb new file mode 100644 index 0000000..a87483a --- /dev/null +++ b/notebooks/Prepare Assistments 2009 full.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "689473a8", + "metadata": {}, + "source": [ + "fulldata.csv can be downloaded from \"ASSISTments 2009-2010 Full Data set\" \n", + "at https://sites.google.com/site/assistmentsdata/home/2009-2010-assistment-data?authuser=0\n", + "\n", + "We should check for duplicate lines." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d273b8fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_idassignment_iduser_idassistment_idproblem_idoriginalcorrectattempt_countms_first_response_timetutor_modeanswer_typesequence_idstudent_class_idpositionproblem_set_typebase_sequence_idlist_skill_idslist_skillsteacher_idschool_id
02022408523236873963429047642900.03106016.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
12022409523236873963429047643001.01194187.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
22022411323236873963429047643101.0112734.0tutoralgebra6272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
32022412323236873963429047643201.01333484.0tutorchoose_16272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
42022414223236873963429047643300.0252828.0tutoralgebra6272.011816.093.0MasterySection6272.0NaNNaN22763.073.0
\n", + "
" + ], + "text/plain": [ + " order_id assignment_id user_id assistment_id problem_id original \\\n", + "0 20224085 232368 73963 42904 76429 0 \n", + "1 20224095 232368 73963 42904 76430 0 \n", + "2 20224113 232368 73963 42904 76431 0 \n", + "3 20224123 232368 73963 42904 76432 0 \n", + "4 20224142 232368 73963 42904 76433 0 \n", + "\n", + " correct attempt_count ms_first_response_time tutor_mode answer_type \\\n", + "0 0.0 3 106016.0 tutor choose_1 \n", + "1 1.0 1 194187.0 tutor choose_1 \n", + "2 1.0 1 12734.0 tutor algebra \n", + "3 1.0 1 333484.0 tutor choose_1 \n", + "4 0.0 2 52828.0 tutor algebra \n", + "\n", + " sequence_id student_class_id position problem_set_type base_sequence_id \\\n", + "0 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "1 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "2 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "3 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "4 6272.0 11816.0 93.0 MasterySection 6272.0 \n", + "\n", + " list_skill_ids list_skills teacher_id school_id \n", + "0 NaN NaN 22763.0 73.0 \n", + "1 NaN NaN 22763.0 73.0 \n", + "2 NaN NaN 22763.0 73.0 \n", + "3 NaN NaN 22763.0 73.0 \n", + "4 NaN NaN 22763.0 73.0 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv('data/assistments2009full/fulldata.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c6228adb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "108" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['school_id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4b338b5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "list_skills\n", + "Addition and Subtraction Integers 24161\n", + "Conversion of Fraction Decimals Percents 23179\n", + "Proportion 22099\n", + "Addition and Subtraction Fractions 16232\n", + "Multiplication and Division Integers 13363\n", + " ... \n", + "Addition and Subtraction Integers;Table 1\n", + "Addition and Subtraction Positive Decimals;Multiplication and Division Positive Decimals;Perimeter of a Polygon;Pythagorean Theorem;Rounding;Square Root 1\n", + "Estimation;Multiplication and Division Positive Decimals;Rounding 1\n", + "Addition and Subtraction Positive Decimals;Table 1\n", + "Ordering Fractions;Table 1\n", + "Name: count, Length: 337, dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['list_skills'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "014920c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "skill\n", + "0 577989\n", + "21 24161\n", + "82 23179\n", + "281 22099\n", + "15 16232\n", + " ... \n", + "25 1\n", + "30 1\n", + "131 1\n", + "31 1\n", + "228 1\n", + "Name: count, Length: 338, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "\n", + "df['skill'] = np.unique(df['list_skills'].fillna(''), return_inverse=True)[1]\n", + "df['skill'].value_counts()\n", + "# This was a fast way; should do it properly with a q-matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a0414cee", + "metadata": {}, + "outputs": [], + "source": [ + "MAPPING = {\n", + " 'user_id': 'user',\n", + " 'problem_id': 'item',\n", + " 'skill': 'skill',\n", + " 'correct': 'correct',\n", + " 'school_id': 'school_id'\n", + "}\n", + "subset = df[MAPPING.keys()].rename(columns=MAPPING).copy()\n", + "\n", + "subset['wins'] = 0\n", + "subset['fails'] = 0\n", + "subset['correct'] = subset['correct'].astype(int)\n", + "subset['school_id'] = subset['school_id'].fillna(0).astype(int)\n", + "subset.to_csv('data/assistments2009full/data.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a4e8c36e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useritemskillcorrectschool_idwinsfails
07396376429007300
17396376430017300
27396376431017300
37396376432017300
47396376433007300
\n", + "
" + ], + "text/plain": [ + " user item skill correct school_id wins fails\n", + "0 73963 76429 0 0 73 0 0\n", + "1 73963 76430 0 1 73 0 0\n", + "2 73963 76431 0 1 73 0 0\n", + "3 73963 76432 0 1 73 0 0\n", + "4 73963 76433 0 0 73 0 0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e2f291c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1011079" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(subset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sktm.py b/sktm.py index 8255ec1..08e79f1 100644 --- a/sktm.py +++ b/sktm.py @@ -1,5 +1,7 @@ """ Efficient implementation of knowledge tracing machines using scikit-learn. + +Currently: will not work on wins and fails > 1. """ import argparse from sklearn.pipeline import Pipeline @@ -19,7 +21,7 @@ df = pd.read_csv(options.csv_file) pipe = Pipeline([ ('onehot', OneHotEncoder(handle_unknown='ignore')), - ('lr', LogisticRegression(solver='liblinear')) + ('lr', LogisticRegression(solver='liblinear', C=1e-1, max_iter=300)) ]) @@ -54,3 +56,23 @@ pipe.fit(df_train[['skill', 'wins', 'fails']], df_train['correct']) print(pipe.predict_proba(df_test[['skill', 'wins', 'fails']])[:, 1]) break + +print('Full training PFA') +pipe.fit(df[['skill', 'wins', 'fails']], df['correct']) +print(pipe['lr'].coef_) + +print('Full training UISWF') +pipe.fit(df[['user', 'item', 'skill', 'wins', 'fails']], df['correct']) + +# Test for dummy dataset +coef = pipe['lr'].coef_[0] +print(coef.shape) +print(coef) +print(df.nunique()) +nb = [5, 2, 2, 1, 2] +print(sum(nb)) +print('Users', coef[:5]) +print('Items', coef[5:7]) +print('Skills', coef[7:9]) +print('Wins', coef[9:10]) +print('Fails', coef[10:12])