diff --git a/part_a/matrix_factorization.py b/part_a/matrix_factorization.py index c346703..b1f02e6 100644 --- a/part_a/matrix_factorization.py +++ b/part_a/matrix_factorization.py @@ -119,8 +119,8 @@ def als(train_data, k, lr, num_iteration): losses = [] for i in range(num_iteration): - if i % (num_iteration//50) == 0: - print(f"{100*(i / num_iteration)}%") + if i % (num_iteration // 100) == 0: + print(f"{100 * (i / num_iteration)}%") losses.append((i, squared_error_loss(train_data, u, z))) u, z = update_u_z(train_data, lr, u, z) @@ -143,10 +143,16 @@ def main(): # using the validation set. # ##################################################################### # best k = 9 - # for k in [1, 5, 7, 8, 9, 10, 20]: + # scores = [] + # for k in range(1, 25): # x = svd_reconstruct(train_matrix, k) + # scores.append(sparse_matrix_evaluate(val_data, x)) # print(k, sparse_matrix_evaluate(val_data, x)) - + # plt.plot(range(1, 25), scores) + # plt.xlabel("k") + # plt.ylabel("Validation Score") + # plt.title("SVD k vs Validation Score") + # plt.show() # k = 9 # x = svd_reconstruct(train_matrix, k) # print('Train, k=9', sparse_matrix_evaluate(train_data, x)) @@ -165,18 +171,22 @@ def main(): # best hyperparamters found so far lr = 0.01 - num_iter = 1000000 k = 100 + num_iter = 800000 mat, losses = als(train_data, k, lr, num_iter) - score = sparse_matrix_evaluate(val_data, mat) - print(f"k={k}, lr={lr}, num_iter={num_iter}, score={score}") + print(f"k={k}, lr={lr}, num_iter={num_iter}") + print(f"Validation Score={sparse_matrix_evaluate(val_data, mat)}") + print(f"Test Score={sparse_matrix_evaluate(test_data, mat)}") + + # plt.plot([i[0] for i in losses], [i[1] for i in losses]) + plt.title("Squared Error vs Number of Iterations") + plt.ylabel("Squared Error") + plt.xlabel("# Iterations") plt.plot([i[0] for i in losses], [i[1] for i in losses]) plt.show() - # np.save("matrx_fac", mat) - ##################################################################### # END OF YOUR CODE # 
##################################################################### diff --git a/part_b/matrix_factorization_b.py b/part_b/matrix_factorization_b.py new file mode 100644 index 0000000..e432b91 --- /dev/null +++ b/part_b/matrix_factorization_b.py @@ -0,0 +1,146 @@ +from utils import * +from scipy.linalg import sqrtm + +import numpy as np +import matplotlib.pyplot as plt +from datetime import datetime + + +def squared_error_loss(data, u, z): + """ Return the squared-error-loss given the data. + :param data: A dictionary {user_id: list, question_id: list, + is_correct: list} + :param u: 2D matrix + :param z: 2D matrix + :return: float + """ + loss = 0 + for i, q in enumerate(data["question_id"]): + loss += (data["is_correct"][i] + - np.sum(u[data["user_id"][i]] * z[q])) ** 2. + return 0.5 * loss + + +def update_u_z(train_data, lr, u: np.ndarray, z: np.ndarray): + """ Return the updated U and Z after applying + stochastic gradient descent for matrix completion. + + :param train_data: A dictionary {user_id: list, question_id: list, + is_correct: list} + :param lr: float + :param u: 2D matrix + :param z: 2D matrix + :return: (u, z) + """ + + # Randomly select a pair (user_id, question_id). + i = \ + np.random.choice(len(train_data["question_id"]), 1)[0] + + c = train_data["is_correct"][i] + n = train_data["user_id"][i] + m = train_data["question_id"][i] + u_n = u[n, :] + z_m = z[m, :] + + u_n = u_n + lr * (c - u_n.dot(z_m)) * z_m + z_m = z_m + lr * (c - u_n.dot(z_m)) * u_n + + u[n, :] = u_n + z[m, :] = z_m + + return u, z + + +def als(train_data, lr, num_iteration, initial_u: np.ndarray, initial_z: np.ndarray): + """ Performs ALS algorithm. Return reconstructed matrix. + + :param train_data: A dictionary {user_id: list, question_id: list, + is_correct: list} + :param k: int + :param lr: float + :param num_iteration: int + :param initial_u: np.ndarray + :param initial_z: np.ndarray + :return: 2D reconstructed Matrix. 
+ """ + # Initialize u and z + u = initial_u.copy() + z = initial_z.copy() + ##################################################################### + # TODO: # + # Implement the function as described in the docstring. # + ##################################################################### + + losses = [] + for i in range(num_iteration): + if i % (num_iteration // 100) == 0: + print(f"{100 * (i / num_iteration)}%") + losses.append((i, squared_error_loss(train_data, u, z))) + u, z = update_u_z(train_data, lr, u, z) + + mat = u @ z.transpose() + ##################################################################### + # END OF YOUR CODE # + ##################################################################### + return mat, losses + + +def init_uz(train_data, question_metadata, student_metadata, num_subjects): + subject_lookup = [arr[1] for arr in + sorted(list(zip(question_metadata['question_id'], question_metadata['subject_id'])), + key=lambda x: x[0])] + + num_students = len(set(train_data["user_id"])) + num_questions = len(set(train_data["question_id"])) + + u = np.full((num_students, num_subjects), 0) + qs_in_cat = np.full((num_students, num_subjects), 0) # number of questions answered in each category per student + + z = np.full((num_questions, num_subjects), 0.1) + + total_answered = np.full(num_students, 0) + for sid, qid, corr in zip(train_data['user_id'], train_data['question_id'], train_data['is_correct']): + u[sid, subject_lookup[qid]] += corr + qs_in_cat[sid, subject_lookup[qid]] += 1 + total_answered[sid] += 1 + + for i, subjs in enumerate(subject_lookup): + z[i, subjs] = 0.9 + + with np.errstate(invalid='ignore'): + u = u / qs_in_cat + u[np.isnan(u)] = np.random.uniform(0, 1 / np.sqrt(num_subjects), u[np.isnan(u)].shape) + return u, z + + +def main(): + num_subjects = 388 + student_meta, stu_heads = load_meta("../data/student_meta.csv", [int, int, datetime.fromisoformat, float]) + question_meta, q_heads = load_meta("../data/question_meta.csv", + [int, 
lambda x: [int(i) for i in x[1:-1].split(', ')]]) + train_data = load_train_csv("../data") + val_data = load_valid_csv("../data") + test_data = load_public_test_csv("../data") + + # # best hyperparameters found so far + lr = 0.01 + k = 100 + num_iter = 800000 + u, z = init_uz(train_data, question_meta, student_meta, num_subjects) + mat, losses = als(train_data, lr, num_iter, u, z) + print(f"k={k}, lr={lr}, num_iter={num_iter}") + print(f"Validation Score={sparse_matrix_evaluate(val_data, mat)}") + print(f"Test Score={sparse_matrix_evaluate(test_data, mat)}") + + # plt.plot([i[0] for i in losses], [i[1] for i in losses]) + plt.title("Squared Error vs Number of Iterations") + plt.ylabel("Squared Error") + plt.xlabel("# Iterations") + + plt.plot([i[0] for i in losses], [i[1] for i in losses]) + plt.show() + + +if __name__ == "__main__": + main() diff --git a/utils.py b/utils.py index 990a6dd..29d7c61 100644 --- a/utils.py +++ b/utils.py @@ -5,6 +5,30 @@ import os +def load_meta(path, parse_fns): + # A helper function to load the csv file. + if not os.path.exists(path): + raise Exception("The specified path {} does not exist.".format(path)) + + data = {} + headers = [] + # Iterate over the rows to fill in the data. + with open(path, "r") as csv_file: + reader = csv.reader(csv_file) + for row_num, row in enumerate(reader): + if row_num == 0: + headers = [h for h in row] + data = {h: [] for h in headers} + else: + for i, d in enumerate(row): + try: + data[headers[i]].append(parse_fns[i](d)) + except (TypeError, ValueError): # field is missing + data[headers[i]].append(None) + + return data, headers + + def _load_csv(path): # A helper function to load the csv file. if not os.path.exists(path):