From eaaec379be2dc87963bc2219a84b2dcc759cfdaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jill-J=C3=AAnn=20Vie?= Date: Wed, 27 Dec 2023 21:35:14 +0900 Subject: [PATCH] Improve SKTM and doc --- README.md | 38 ++++++++-- fm.py | 172 +++++++++++++++++++++++++++---------------- sktm.py | 61 +++++++++++---- tests/test_encode.py | 5 +- 4 files changed, 190 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index 17651ee..a4a475f 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,9 @@ # Knowledge Tracing Machines -- Presented at the AAAI 2019 conference in Honolulu, Hawaii on January 27, 2019. +- Presented at the AAAI 2019 conference in Honolulu, Hawaii on January 27, 2019 [[arXiv]](https://arxiv.org/abs/1811.03388) [[slides]](https://jiji.cat/slides/aaai2019-ktm-slides.pdf). - Applied in the [Best Paper Award](https://arxiv.org/abs/1905.06873) of the EDM 2019 conference in Montreal, Canada on July 2, 2019. -See our article: [Knowledge Tracing Machines: Factorization Machines for Knowledge Tracing [pdf]](https://arxiv.org/abs/1811.03388) [[slides]](http://jiji.cat/slides/aaai2019-ktm-slides.pdf). -Comments are always welcome! - @inproceedings{Vie2019, Author = {{Vie}, Jill-J{\^e}nn and {Kashima}, Hisashi}, Booktitle = {Proceedings of the 33th {AAAI} Conference on Artificial Intelligence}, @@ -22,9 +19,9 @@ Authors: [Jill-Jênn Vie](https://jjv.ie), [Hisashi Kashima](https://hkashima.gi Presented at the [Optimizing Human Learning](https://humanlearn.io) workshop in Kingston, Jamaica on June 4, 2019. -Slides from the tutorial are available [here](doc/tuto.pdf). A notebook on Colab will be available "soon". +Slides from the tutorial are available [here](https://jjv.ie/slides/tuto.pdf). A Jupyter notebook will be available "soon" on Binder. -The tutorial makes you play with the models to assess **weak generalization**. 
To assess **strong generalization** and reproduce the experiments of the paper, you want to look at how folds are created in [dataio.py](https://github.com/jilljenn/ktm/blob/master/dataio.py#L12). +The tutorial makes you play with the models to assess **weak generalization**. To assess **strong generalization** and reproduce the experiments of the paper, you may want to use scikit-learn's [GroupShuffleSplit](https://scikit-learn.org/stable/modules/cross_validation.html#group-shuffle-split). ## Install @@ -64,6 +61,35 @@ you should run `encode_tw.py` instead of this file, with the `--pfa` option for ## Running +### NEW! 2024 update: efficient scikit-learn implementation + +Are you excited? If so, check [sktm.py](sktm.py). + +```python +pipe = Pipeline([ + ('onehot', OneHotEncoder(handle_unknown='ignore')), + ('lr', LogisticRegression(solver='liblinear')) +]) + +# IRT +pipe.fit(df_train[['user', 'item']], df_train['correct']) +print(pipe.predict_proba(df_test[['user', 'item']])) + +# PFA +pipe.fit(df_train[['skill', 'wins', 'fails']], df_train['correct']) +print(pipe.predict_proba(df_test[['skill', 'wins', 'fails']])) +``` + +sktm contains efficient parallel cross validation over 5 random splits, grouped by user (i.e. strong generalization). + +Usage: + + mkdir data/assistments09 + wget https://jiji.cat/weasel2018/data.csv -P data/assistments09 + python sktm.py --dataset assistments09 --model (irt|pfa|iswf) # Choose which model + +For factorization machines, replace `LogisticRegression` with `FMClassifier` (`from fm import FMClassifier`). 
+ ### Available datasets - [Assistments 2009](https://sites.google.com/site/assistmentsdata/home/2009-2010-assistment-data) diff --git a/fm.py b/fm.py index 98d8a2e..ec79f46 100644 --- a/fm.py +++ b/fm.py @@ -11,73 +11,117 @@ import pywFM import numpy as np from dataio import get_paths, load_folds +import sklearn # Location of libFM's compiled binary file os.environ['LIBFM_PATH'] = str(Path('libfm/bin').absolute()) + '/' -parser = argparse.ArgumentParser(description='Run FM') -parser.add_argument('X_file', type=str, nargs='?') -parser.add_argument('--iter', type=int, nargs='?', default=20) -parser.add_argument('--d', type=int, nargs='?', default=20) -parser.add_argument('--subset', type=int, nargs='?', default=0) -parser.add_argument('--metrics', type=bool, nargs='?', const=True, - default=False) -parser.add_argument('--folds', type=str, nargs='?', default='weak') -options = parser.parse_args() - - -df, X_file, folder, y_file, y_pred_file = get_paths(options, 'FM') -X_sp = load_npz(X_file).tocsr() -nb_samples, _ = X_sp.shape -y = np.load(y_file).astype(np.int32) - - -predictions = [] -params = { - 'task': 'classification', - 'num_iter': options.iter, - 'rlog': True, - 'learning_method': 'mcmc', - 'k2': options.d -} -fm = pywFM.FM(**params) -for i, (i_train, i_test) in enumerate(load_folds(options, df)): - X_train, X_test, y_train, y_test = (X_sp[i_train], X_sp[i_test], - y[i_train], y[i_test]) - - model = fm.run(X_train, y_train, X_test, y_test) - y_pred_test = np.array(model.predictions) - - predictions.append({ - 'fold': 0, - 'pred': y_pred_test.tolist(), - 'y': y_test.tolist() - }) - - if options.metrics: - df_test = df.iloc[i_test] - assert len(df_test) == len(y_pred_test) - df_test['pred'] = y_pred_test - df_test.to_csv(y_pred_file, index=False) - - print('Test predict:', y_pred_test) - print('Test was:', y_test) - print('Test ACC:', np.mean(y_test == np.round(y_pred_test))) - try: - print('Test AUC', roc_auc_score(y_test, y_pred_test)) - print('Test NLL', 
log_loss(y_test, y_pred_test)) - except ValueError: - pass - - iso_date = datetime.now().isoformat() - np.save(folder / 'w.npy', np.array(model.weights)) - np.save(folder / 'V.npy', model.pairwise_interactions) - saved_results = { - 'predictions': predictions, - 'model': vars(options), - 'mu': model.global_bias, + +def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + +class FMClassifier(sklearn.base.BaseEstimator): + def __init__(self, embedding_size=20, nb_iterations=40): + super().__init__() + self.embedding_size = embedding_size + self.nb_iterations = nb_iterations + + def fit(self, X, y): + """ + X is usually sparse, nb_samples x nb_features + y is binary + """ + fm = pywFM.FM(task='classification', num_iter=self.nb_iterations, + k2=self.embedding_size, rlog=True) # MCMC method + # rlog contains the RMSE at each epoch, we do not need it here + model = fm.run(X, y, X, y) + + # Store parameters + self.mu = model.global_bias + self.W = np.array(model.weights) + self.V = model.pairwise_interactions + self.V2 = np.power(self.V, 2) + self.rlog = model.rlog + return self + + def predict_proba(self, X): + X2 = X.copy() + if scipy.sparse.issparse(X): + X2.data **= 2 + else: + X2 **= 2 + + y_pred = (self.mu + X @ self.W + + 0.5 * (np.power(X @ self.V, 2).sum(axis=1) + - (X2 @ self.V2).sum(axis=1)).A1) + return sigmoid(y_pred) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run FM') + parser.add_argument('X_file', type=str, nargs='?') + parser.add_argument('--iter', type=int, nargs='?', default=20) + parser.add_argument('--d', type=int, nargs='?', default=20) + parser.add_argument('--subset', type=int, nargs='?', default=0) + parser.add_argument('--metrics', type=bool, nargs='?', const=True, + default=False) + parser.add_argument('--folds', type=str, nargs='?', default='weak') + options = parser.parse_args() + + + df, X_file, folder, y_file, y_pred_file = get_paths(options, 'FM') + X_sp = load_npz(X_file).tocsr() + nb_samples, _ = X_sp.shape + 
y = np.load(y_file).astype(np.int32) + + + predictions = [] + params = { + 'task': 'classification', + 'num_iter': options.iter, + 'rlog': True, + 'learning_method': 'mcmc', + 'k2': options.d } - with open(folder / f'results-{iso_date}.json', 'w') as f: - json.dump(saved_results, f) - break + fm = pywFM.FM(**params) + for i, (i_train, i_test) in enumerate(load_folds(options, df)): + X_train, X_test, y_train, y_test = (X_sp[i_train], X_sp[i_test], + y[i_train], y[i_test]) + + model = fm.run(X_train, y_train, X_test, y_test) + y_pred_test = np.array(model.predictions) + + predictions.append({ + 'fold': 0, + 'pred': y_pred_test.tolist(), + 'y': y_test.tolist() + }) + + if options.metrics: + df_test = df.iloc[i_test] + assert len(df_test) == len(y_pred_test) + df_test['pred'] = y_pred_test + df_test.to_csv(y_pred_file, index=False) + + print('Test predict:', y_pred_test) + print('Test was:', y_test) + print('Test ACC:', np.mean(y_test == np.round(y_pred_test))) + try: + print('Test AUC', roc_auc_score(y_test, y_pred_test)) + print('Test NLL', log_loss(y_test, y_pred_test)) + except ValueError: + pass + + iso_date = datetime.now().isoformat() + np.save(folder / 'w.npy', np.array(model.weights)) + np.save(folder / 'V.npy', model.pairwise_interactions) + saved_results = { + 'predictions': predictions, + 'model': vars(options), + 'mu': model.global_bias, + } + with open(folder / f'results-{iso_date}.json', 'w') as f: + json.dump(saved_results, f) + break diff --git a/sktm.py b/sktm.py index cf185a8..4021427 100644 --- a/sktm.py +++ b/sktm.py @@ -1,24 +1,55 @@ +""" +Efficient implementation of knowledge tracing machines using scikit-learn. 
+""" +import argparse from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import cross_validate, GroupShuffleSplit import pandas as pd -df = pd.read_csv('data/dummy/data.csv') -estimators = [ - ('onehot', OneHotEncoder()), - ('lr', LogisticRegression()) -] -pipe = Pipeline(estimators) +parser = argparse.ArgumentParser(description='Run simple KTM') +parser.add_argument('--dataset', type=str, nargs='?', default='dummy') +parser.add_argument('--model', type=str, nargs='?', default='iswf') +options = parser.parse_args() -# Just check the encoded variables -ohe = OneHotEncoder() -print(ohe.fit_transform(df[['user', 'item']]).toarray()) -# IRT -pipe.fit(df[['user', 'item']], df['correct']) -print(pipe.predict_proba(df[['user', 'item']])) +df = pd.read_csv(f'data/{options.dataset}/data.csv') +pipe = Pipeline([ + ('onehot', OneHotEncoder(handle_unknown='ignore')), + ('lr', LogisticRegression(solver='liblinear')) +]) -# PFA -pipe.fit(df[['skill', 'wins', 'fails']], df['correct']) -print(pipe.predict_proba(df[['skill', 'wins', 'fails']])) + +cv = GroupShuffleSplit(n_splits=5, random_state=42) +METRICS = ['accuracy', 'roc_auc', 'neg_log_loss'] +if options.model == 'irt': + FEATURES = ['user', 'item'] +elif options.model == 'pfa': + FEATURES = ['skill', 'wins', 'fails'] +else: + FEATURES = ['item', 'skill', 'wins', 'fails'] + +cv_results = cross_validate( + pipe, df[FEATURES], df['correct'], + scoring=METRICS, # Use all scores + return_train_score=True, n_jobs=-1, # Use all cores + cv=cv, groups=df['user'], verbose=10 +) +for metric in METRICS: + print(metric, cv_results[f"test_{metric}"].mean()) + + +for i_train, i_test in cv.split(df, groups=df['user']): + df_train = df.iloc[i_train] + df_test = df.iloc[i_test] + + # IRT + pipe.fit(df_train[['user', 'item']], df_train['correct']) + print(pipe.predict_proba(df_test[['user', 'item']])[:, 1]) + + # PFA + 
pipe.fit(df_train[['skill', 'wins', 'fails']], df_train['correct']) + print(pipe.predict_proba(df_test[['skill', 'wins', 'fails']])[:, 1]) + break diff --git a/tests/test_encode.py b/tests/test_encode.py index d49977c..4478fa2 100644 --- a/tests/test_encode.py +++ b/tests/test_encode.py @@ -9,7 +9,10 @@ def test_encode(self): ['python', 'lr.py', 'data/dummy/X-ui.npz'], ['python', 'lr.py', '--folds', 'strong', 'data/dummy/X-ui.npz'], ['python', 'fm.py', 'data/dummy/X-ui.npz'], - ['python', 'fm.py', '--folds', 'weak', 'data/dummy/X-ui.npz'] + ['python', 'fm.py', '--folds', 'weak', 'data/dummy/X-ui.npz'], + ['python', 'sktm.py', '--model', 'irt'], + ['python', 'sktm.py', '--model', 'pfa'], + ['python', 'sktm.py', '--model', 'iswf'] ] for command in commands: p = check_output(command)