diff --git a/.gitignore b/.gitignore index 22ffd69..27dda23 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc *.env mergedb.json +*.joblib diff --git a/models/names/test.py b/models/names/test.py index 3df577b..41fc37d 100644 --- a/models/names/test.py +++ b/models/names/test.py @@ -61,7 +61,31 @@ def test_model(conn: sqlite3.Connection, results_path: str, model: str): accuracy = (tp + tn) / (pos + neg) precision = tp / (tp + fp) recall = tp / pos - f1 = 2 * precision * recall / (precision + recall) + + if pos + neg == 0: + print(f"Why are you testing with no data?") + sys.exit() + accuracy = (tp + tn) / (pos + neg) + + if tp + fp == 0: + print("Precision could not be calculated (no positive predictions)") + precision = None + else: + precision = tp / (tp + fp) + + if pos == 0: + print("Recall could not be calculated - why are you testing with no positive samples in the set?") + recall = None + else: + recall = tp / pos + + if precision == 0 or recall == 0: + f1 = 0 + elif precision is None or recall is None: + f1 = None + else: + f1 = 2 * precision * recall / (precision + recall) + print(f"Accuracy: {accuracy * 100:.3f}%") results = { "pos": pos, diff --git a/models/names/train_adaboost.py b/models/names/train_adaboost.py new file mode 100644 index 0000000..1366ae8 --- /dev/null +++ b/models/names/train_adaboost.py @@ -0,0 +1,80 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.ensemble import AdaBoostClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_adaboost.py --dbpath=""\n' +MODEL_FILE = 'names_adaboost.joblib' + +def train_adaboost(conn: sqlite3.Connection): + """Trains function name classifier using AdaBoost model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = 
utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + # defaults to 50 estimators + ab = AdaBoostClassifier(n_estimators=50, random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(ab, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + ab.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(ab, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_adaboost(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_dtree.py b/models/names/train_dtree.py new file mode 100644 index 0000000..87e6231 --- /dev/null +++ b/models/names/train_dtree.py @@ -0,0 +1,79 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.tree import DecisionTreeClassifier +from 
sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_dtree.py --dbpath=""\n' +MODEL_FILE = 'names_dtree.joblib' + +def train_decision_tree(conn: sqlite3.Connection): + """Trains function name classifier using Decision Tree model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + tree = DecisionTreeClassifier(random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(tree, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + tree.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(tree, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): 
+ raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_decision_tree(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_gnbayes.py b/models/names/train_gnbayes.py index 9fa978c..151a311 100644 --- a/models/names/train_gnbayes.py +++ b/models/names/train_gnbayes.py @@ -10,7 +10,7 @@ MODEL_FILE = 'names_gnbayes.joblib' def train_naive_bayes(conn: sqlite3.Connection): - """Trains function name classifier using Gaussian Naive Bayes (scikit-learn) model and saves it to a file.""" + """Trains function name classifier using Gaussian Naive Bayes model (scikit-learn) and saves it to a file.""" cur = conn.cursor() print('Loading FastText model...') @@ -37,11 +37,12 @@ def train_naive_bayes(conn: sqlite3.Connection): x_train = utils.listify(x_train['lit_vec'].to_list()) y_train = tuple(y_train.to_list()) - # scaling + print("Scaling data...") scaler = StandardScaler() scaler.fit(x_train) scaler.transform(x_train) + print('Initializing classifier model...') gnb = GaussianNB() print("Cross-validation (5-fold)...") diff --git a/models/names/train_knn.py b/models/names/train_knn.py new file mode 100644 index 0000000..8fc15e8 --- /dev/null +++ b/models/names/train_knn.py @@ -0,0 +1,80 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_knn.py --dbpath=""\n' +MODEL_FILE = 'names_knn.joblib' + +def train_nearest_neighbours(conn: sqlite3.Connection): + """Trains function name classifier using k-Nearest Neighbors model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + 
print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + # 5 neighbors is the default + knn = KNeighborsClassifier(n_neighbors=5) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(knn, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + knn.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(knn, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_nearest_neighbours(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_logreg.py b/models/names/train_logreg.py new file mode 100644 index 0000000..270f2c9 --- /dev/null +++ b/models/names/train_logreg.py @@ -0,0 +1,79 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import 
cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_logreg.py --dbpath=""\n' +MODEL_FILE = 'names_logreg.joblib' + +def train_logistic_regression(conn: sqlite3.Connection): + """Trains function name classifier using Logistic Regression model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + lr = LogisticRegression(random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(lr, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + lr.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(lr, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = 
sqlite3.connect(db_path) + train_logistic_regression(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_lsvc.py b/models/names/train_lsvc.py new file mode 100644 index 0000000..12efed6 --- /dev/null +++ b/models/names/train_lsvc.py @@ -0,0 +1,79 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.svm import LinearSVC +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_lsvc.py --dbpath=""\n' +MODEL_FILE = 'names_lsvc.joblib' + +def train_linear_svc(conn: sqlite3.Connection): + """Trains function name classifier using Linear Support Vector model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + svc = LinearSVC(dual='auto', random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(svc, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + svc.fit(X=x_train, y=y_train) + file_path = 
utils.get_model_path(MODEL_FILE) + dump(svc, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_linear_svc(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_nn.py b/models/names/train_nn.py new file mode 100644 index 0000000..49f95aa --- /dev/null +++ b/models/names/train_nn.py @@ -0,0 +1,80 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.neural_network import MLPClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_nn.py --dbpath=""\n' +MODEL_FILE = 'names_nn.joblib' + +def train_neural_network(conn: sqlite3.Connection): + """Trains function name classifier using Multi-layer Perceptron model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + 
print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + # defaults + mlp = MLPClassifier(solver='adam', max_iter=200, random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(mlp, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + mlp.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(mlp, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_neural_network(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/train_rforest.py b/models/names/train_rforest.py new file mode 100644 index 0000000..205f66a --- /dev/null +++ b/models/names/train_rforest.py @@ -0,0 +1,79 @@ +import sqlite3, sys, os, getopt, pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils + + +HELP = 'Usage:\npython train_rforest.py --dbpath=""\n' +MODEL_FILE = 'names_rforest.joblib' + +def train_random_forest(conn: sqlite3.Connection): + """Trains function name classifier using Random Forest model (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching 
data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(literals, labels) + + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print('Initializing classifier model...') + rf = RandomForestClassifier(random_state=0) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(rf, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + rf.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(rf, file_path) + print(f'Model saved to {file_path}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_random_forest(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/names/utils.py b/models/names/utils.py index 462c301..08c76b9 100644 --- a/models/names/utils.py +++ b/models/names/utils.py @@ -108,9 +108,9 @@ def save_results(results: dict, table: str, dbpath: str): fp INTEGER NOT NULL, fn INTEGER NOT NULL, accuracy REAL NOT NULL, - precision REAL NOT NULL, - recall REAL NOT NULL, - f1 REAL NOT NULL)''') + precision REAL, + recall REAL, + f1 REAL)''') except Exception as 
ex: print(ex) sys.exit() diff --git a/models/names_gnbayes.joblib b/models/names_gnbayes.joblib deleted file mode 100644 index bdc470d..0000000 Binary files a/models/names_gnbayes.joblib and /dev/null differ diff --git a/models/paths/test.py b/models/paths/test.py index 7fea771..c1bbb89 100644 --- a/models/paths/test.py +++ b/models/paths/test.py @@ -63,10 +63,32 @@ def test_model(conn: sqlite3.Connection, results_path: str, model: str): tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel() pos = tp + fn neg = tn + fp + + if pos + neg == 0: + print(f"Why are you testing with no data?") + sys.exit() accuracy = (tp + tn) / (pos + neg) - precision = tp / (tp + fp) - recall = tp / pos - f1 = 2 * precision * recall / (precision + recall) + + if tp + fp == 0: + print("Precision could not be calculated (no positive predictions)") + precision = None + else: + precision = tp / (tp + fp) + + if pos == 0: + print("Recall could not be calculated - why are you testing with no positive samples in the set?") + recall = None + else: + recall = tp / pos + + if precision is None or recall is None: + f1 = None + else: + if precision == 0 or recall == 0: + f1 = 0 + else: + f1 = 2 * precision * recall / (precision + recall) + print(f"Accuracy: {accuracy * 100:.3f}%") results = { "pos": pos, diff --git a/models/paths/train_adaboost.py b/models/paths/train_adaboost.py new file mode 100644 index 0000000..53a30b8 --- /dev/null +++ b/models/paths/train_adaboost.py @@ -0,0 +1,90 @@ +import sqlite3, sys, os, getopt, pandas as pd +from datetime import datetime +from sklearn.ensemble import AdaBoostClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import PathsClassifierUtils as utils + + +HELP = 'Usage:\npython train_adaboost.py --dbpath=""\n' +MODEL_FILE = 'paths_adaboost.joblib' +COLUMNS = ['ref_depth', + 'is_upward', + 'nb_referrers', + 'nb_strings', + 'nb_referees', + 
'instructions', + 'lit_vec'] +"""Training data features.""" + +def train_adaboost(conn: sqlite3.Connection): + """Trains cross-reference path AdaBoost classifier (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + start = datetime.now() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + data = utils.get_unbalanced_data(cur) + labels = data['names_func'] + data.drop(['names_func'], axis=1, inplace=True) + + print("Performing word embedding...") + data = utils.ft_embed(ft, data) + data.drop(['token_literal'], axis=1, inplace=True) + + print('Initializing classifier model...') + # defaults to 50 estimators + ab = AdaBoostClassifier(n_estimators=50, random_state=0) + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(data, labels) + x_train = pd.DataFrame(data=x_train, columns=COLUMNS) + x_train = utils.listify(x_train) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(ab, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + ab.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(ab, file_path) + print(f'Model saved to {file_path}') + print(f'Start time:\t{start}') + print(f'End time:\t{datetime.now()}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + 
train_adaboost(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/paths/train_dtree.py b/models/paths/train_dtree.py new file mode 100644 index 0000000..5c1a2e0 --- /dev/null +++ b/models/paths/train_dtree.py @@ -0,0 +1,89 @@ +import sqlite3, sys, os, getopt, pandas as pd +from datetime import datetime +from sklearn.tree import DecisionTreeClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import PathsClassifierUtils as utils + + +HELP = 'Usage:\npython train_dtree.py --dbpath=""\n' +MODEL_FILE = 'paths_dtree.joblib' +COLUMNS = ['ref_depth', + 'is_upward', + 'nb_referrers', + 'nb_strings', + 'nb_referees', + 'instructions', + 'lit_vec'] +"""Training data features.""" + +def train_decision_tree(conn: sqlite3.Connection): + """Trains cross-reference path Decision Tree classifier (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + start = datetime.now() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + data = utils.get_unbalanced_data(cur) + labels = data['names_func'] + data.drop(['names_func'], axis=1, inplace=True) + + print("Performing word embedding...") + data = utils.ft_embed(ft, data) + data.drop(['token_literal'], axis=1, inplace=True) + + print('Initializing classifier model...') + tree = DecisionTreeClassifier(random_state=0) + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(data, labels) + x_train = pd.DataFrame(data=x_train, columns=COLUMNS) + x_train = utils.listify(x_train) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(tree, X=x_train, y=y_train) + print("Accuracy: %0.3f" % 
(scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + tree.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(tree, file_path) + print(f'Model saved to {file_path}') + print(f'Start time:\t{start}') + print(f'End time:\t{datetime.now()}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_decision_tree(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/paths/train_gnbayes.py b/models/paths/train_gnbayes.py new file mode 100644 index 0000000..6864321 --- /dev/null +++ b/models/paths/train_gnbayes.py @@ -0,0 +1,89 @@ +import sqlite3, sys, os, getopt, pandas as pd +from datetime import datetime +from sklearn.naive_bayes import GaussianNB +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import PathsClassifierUtils as utils + + +HELP = 'Usage:\npython train_gnbayes.py --dbpath=""\n' +MODEL_FILE = 'paths_gnbayes.joblib' +COLUMNS = ['ref_depth', + 'is_upward', + 'nb_referrers', + 'nb_strings', + 'nb_referees', + 'instructions', + 'lit_vec'] +"""Training data features.""" + +def train_naive_bayes(conn: sqlite3.Connection): + """Trains cross-reference path Gaussian Naive Bayes classifier (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + start = datetime.now() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + data = utils.get_unbalanced_data(cur) + labels = 
data['names_func'] + data.drop(['names_func'], axis=1, inplace=True) + + print("Performing word embedding...") + data = utils.ft_embed(ft, data) + data.drop(['token_literal'], axis=1, inplace=True) + + print('Initializing classifier model...') + gnb = GaussianNB() + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(data, labels) + x_train = pd.DataFrame(data=x_train, columns=COLUMNS) + x_train = utils.listify(x_train) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(gnb, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + gnb.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(gnb, file_path) + print(f'Model saved to {file_path}') + print(f'Start time:\t{start}') + print(f'End time:\t{datetime.now()}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + + if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_naive_bayes(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/paths/train_knn.py b/models/paths/train_knn.py new file mode 100644 index 0000000..48feb22 --- /dev/null +++ b/models/paths/train_knn.py @@ -0,0 +1,90 @@ +import sqlite3, sys, os, getopt, pandas as pd +from datetime import datetime +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import 
PathsClassifierUtils as utils + + +HELP = 'Usage:\npython train_knn.py --dbpath=""\n' +MODEL_FILE = 'paths_knn.joblib' +COLUMNS = ['ref_depth', + 'is_upward', + 'nb_referrers', + 'nb_strings', + 'nb_referees', + 'instructions', + 'lit_vec'] +"""Training data features.""" + +def train_nearest_neighbours(conn: sqlite3.Connection): + """Trains cross-reference path k-Nearest Neighbors classifier (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + start = datetime.now() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + data = utils.get_unbalanced_data(cur) + labels = data['names_func'] + data.drop(['names_func'], axis=1, inplace=True) + + print("Performing word embedding...") + data = utils.ft_embed(ft, data) + data.drop(['token_literal'], axis=1, inplace=True) + + print('Initializing classifier model...') + # 5 neighbors is the default + knn = KNeighborsClassifier(n_neighbors=5) + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(data, labels) + x_train = pd.DataFrame(data=x_train, columns=COLUMNS) + x_train = utils.listify(x_train) + y_train = tuple(y_train.to_list()) + + print("Scaling data...") + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + print("Cross-validation (5-fold)...") + scores = cross_val_score(knn, X=x_train, y=y_train) + print("Accuracy: %0.3f" % (scores.mean())) + print("Std_dev: %0.3f" % (scores.std())) + + print("Training classifier...") + knn.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(knn, file_path) + print(f'Model saved to {file_path}') + print(f'Start time:\t{start}') + print(f'End time:\t{datetime.now()}') + +def main(argv): + db_path = "" + opts, _ = getopt.getopt(argv,"hd:",["dbpath="]) + for opt, arg in opts: + if opt == '-h': + print(HELP) + sys.exit() + elif opt in ("-d", "--dbpath"): + db_path = arg + 
+ if db_path == "": + raise Exception(f"SQLite database path required\n{HELP}") + if not os.path.isfile(db_path): + raise Exception(f"Database not found at {db_path}") + + conn = sqlite3.connect(db_path) + train_nearest_neighbours(conn) + conn.close() + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/models/paths/train_logreg.py b/models/paths/train_logreg.py new file mode 100644 index 0000000..ad213b5 --- /dev/null +++ b/models/paths/train_logreg.py @@ -0,0 +1,91 @@ +import sqlite3, sys, os, getopt, pandas as pd +from datetime import datetime +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import PathsClassifierUtils as utils + + +HELP = 'Usage:\npython train_logreg.py --dbpath=""\n' +MODEL_FILE = 'paths_logreg.joblib' +COLUMNS = ['ref_depth', + 'is_upward', + 'nb_referrers', + 'nb_strings', + 'nb_referees', + 'instructions', + 'lit_vec'] +"""Training data features.""" + +def train_logistic_regression(conn: sqlite3.Connection): + """Trains cross-reference path Logistic Regression classifier (scikit-learn) and saves it to a file.""" + cur = conn.cursor() + start = datetime.now() + + print('Loading FastText model...') + try: + ft = utils.load_ft(utils.get_embedder_path()) + except Exception as ex: + print(ex) + sys.exit() + + print("Fetching data...") + data = utils.get_unbalanced_data(cur) + labels = data['names_func'] + data.drop(['names_func'], axis=1, inplace=True) + + print("Performing word embedding...") + data = utils.ft_embed(ft, data) + data.drop(['token_literal'], axis=1, inplace=True) + + print('Initializing classifier model...') + # max_iter doubled due to training warnings: + # ConvergenceWarning: lbfgs failed to converge + lr = LogisticRegression(max_iter=200, random_state=0) + + print("Splitting datasets...") + x_train, _, y_train, _ = utils.split_dataset(data, labels) + x_train = 
pd.DataFrame(data=x_train, columns=COLUMNS)
+    x_train = utils.listify(x_train)
+    y_train = tuple(y_train.to_list())
+
+    print("Scaling data...")
+    scaler = StandardScaler()
+    scaler.fit(x_train)
+    scaler.transform(x_train)
+
+    print("Cross-validation (5-fold)...")
+    scores = cross_val_score(lr, X=x_train, y=y_train)
+    print("Accuracy: %0.3f" % (scores.mean()))
+    print("Std_dev: %0.3f" % (scores.std()))
+
+    print("Training classifier...")
+    lr.fit(X=x_train, y=y_train)
+    file_path = utils.get_model_path(MODEL_FILE)
+    dump(lr, file_path)
+    print(f'Model saved to {file_path}')
+    print(f'Start time:\t{start}')
+    print(f'End time:\t{datetime.now()}')
+
+def main(argv):
+    db_path = ""
+    opts, _ = getopt.getopt(argv,"hd:",["dbpath="])
+    for opt, arg in opts:
+        if opt == '-h':
+            print(HELP)
+            sys.exit()
+        elif opt in ("-d", "--dbpath"):
+            db_path = arg
+
+    if db_path == "":
+        raise Exception(f"SQLite database path required\n{HELP}")
+    if not os.path.isfile(db_path):
+        raise Exception(f"Database not found at {db_path}")
+
+    conn = sqlite3.connect(db_path)
+    train_logistic_regression(conn)
+    conn.close()
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/models/paths/train_lsvc.py b/models/paths/train_lsvc.py
new file mode 100644
index 0000000..76bb955
--- /dev/null
+++ b/models/paths/train_lsvc.py
@@ -0,0 +1,89 @@
+import sqlite3, sys, os, getopt, pandas as pd
+from datetime import datetime
+from sklearn.svm import LinearSVC
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import cross_val_score
+from joblib import dump
+from utils import PathsClassifierUtils as utils
+
+
+HELP = 'Usage:\npython train_lsvc.py --dbpath=""\n'
+MODEL_FILE = 'paths_lsvc.joblib'
+COLUMNS = ['ref_depth',
+           'is_upward',
+           'nb_referrers',
+           'nb_strings',
+           'nb_referees',
+           'instructions',
+           'lit_vec']
+"""Training data features."""
+
+def train_linear_svc(conn: sqlite3.Connection):
+    """Trains cross-reference path Linear Support Vector classifier 
(scikit-learn) and saves it to a file."""
+    cur = conn.cursor()
+    start = datetime.now()
+
+    print('Loading FastText model...')
+    try:
+        ft = utils.load_ft(utils.get_embedder_path())
+    except Exception as ex:
+        print(ex)
+        sys.exit()
+
+    print("Fetching data...")
+    data = utils.get_unbalanced_data(cur)
+    labels = data['names_func']
+    data.drop(['names_func'], axis=1, inplace=True)
+
+    print("Performing word embedding...")
+    data = utils.ft_embed(ft, data)
+    data.drop(['token_literal'], axis=1, inplace=True)
+
+    print('Initializing classifier model...')
+    svc = LinearSVC(dual='auto', random_state=0)
+
+    print("Splitting datasets...")
+    x_train, _, y_train, _ = utils.split_dataset(data, labels)
+    x_train = pd.DataFrame(data=x_train, columns=COLUMNS)
+    x_train = utils.listify(x_train)
+    y_train = tuple(y_train.to_list())
+
+    print("Scaling data...")
+    scaler = StandardScaler()
+    scaler.fit(x_train)
+    scaler.transform(x_train)
+
+    print("Cross-validation (5-fold)...")
+    scores = cross_val_score(svc, X=x_train, y=y_train)
+    print("Accuracy: %0.3f" % (scores.mean()))
+    print("Std_dev: %0.3f" % (scores.std()))
+
+    print("Training classifier...")
+    svc.fit(X=x_train, y=y_train)
+    file_path = utils.get_model_path(MODEL_FILE)
+    dump(svc, file_path)
+    print(f'Model saved to {file_path}')
+    print(f'Start time:\t{start}')
+    print(f'End time:\t{datetime.now()}')
+
+def main(argv):
+    db_path = ""
+    opts, _ = getopt.getopt(argv,"hd:",["dbpath="])
+    for opt, arg in opts:
+        if opt == '-h':
+            print(HELP)
+            sys.exit()
+        elif opt in ("-d", "--dbpath"):
+            db_path = arg
+
+    if db_path == "":
+        raise Exception(f"SQLite database path required\n{HELP}")
+    if not os.path.isfile(db_path):
+        raise Exception(f"Database not found at {db_path}")
+
+    conn = sqlite3.connect(db_path)
+    train_linear_svc(conn)
+    conn.close()
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/models/paths/train_nn.py b/models/paths/train_nn.py
new file mode 100644
index 0000000..85641f6
--- /dev/null
+++ 
b/models/paths/train_nn.py
@@ -0,0 +1,90 @@
+import sqlite3, sys, os, getopt, pandas as pd
+from datetime import datetime
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import cross_val_score
+from joblib import dump
+from utils import PathsClassifierUtils as utils
+
+
+HELP = 'Usage:\npython train_nn.py --dbpath=""\n'
+MODEL_FILE = 'paths_nn.joblib'
+COLUMNS = ['ref_depth',
+           'is_upward',
+           'nb_referrers',
+           'nb_strings',
+           'nb_referees',
+           'instructions',
+           'lit_vec']
+"""Training data features."""
+
+def train_neural_network(conn: sqlite3.Connection):
+    """Trains cross-reference path Multi-layer Perceptron classifier (scikit-learn) and saves it to a file."""
+    cur = conn.cursor()
+    start = datetime.now()
+
+    print('Loading FastText model...')
+    try:
+        ft = utils.load_ft(utils.get_embedder_path())
+    except Exception as ex:
+        print(ex)
+        sys.exit()
+
+    print("Fetching data...")
+    data = utils.get_unbalanced_data(cur)
+    labels = data['names_func']
+    data.drop(['names_func'], axis=1, inplace=True)
+
+    print("Performing word embedding...")
+    data = utils.ft_embed(ft, data)
+    data.drop(['token_literal'], axis=1, inplace=True)
+
+    print('Initializing classifier model...')
+    # defaults
+    mlp = MLPClassifier(solver='adam', max_iter=200, random_state=0)
+
+    print("Splitting datasets...")
+    x_train, _, y_train, _ = utils.split_dataset(data, labels)
+    x_train = pd.DataFrame(data=x_train, columns=COLUMNS)
+    x_train = utils.listify(x_train)
+    y_train = tuple(y_train.to_list())
+
+    print("Scaling data...")
+    scaler = StandardScaler()
+    scaler.fit(x_train)
+    scaler.transform(x_train)
+
+    print("Cross-validation (5-fold)...")
+    scores = cross_val_score(mlp, X=x_train, y=y_train)
+    print("Accuracy: %0.3f" % (scores.mean()))
+    print("Std_dev: %0.3f" % (scores.std()))
+
+    print("Training classifier...")
+    mlp.fit(X=x_train, y=y_train)
+    file_path = utils.get_model_path(MODEL_FILE)
+    dump(mlp, 
file_path)
+    print(f'Model saved to {file_path}')
+    print(f'Start time:\t{start}')
+    print(f'End time:\t{datetime.now()}')
+
+def main(argv):
+    db_path = ""
+    opts, _ = getopt.getopt(argv,"hd:",["dbpath="])
+    for opt, arg in opts:
+        if opt == '-h':
+            print(HELP)
+            sys.exit()
+        elif opt in ("-d", "--dbpath"):
+            db_path = arg
+
+    if db_path == "":
+        raise Exception(f"SQLite database path required\n{HELP}")
+    if not os.path.isfile(db_path):
+        raise Exception(f"Database not found at {db_path}")
+
+    conn = sqlite3.connect(db_path)
+    train_neural_network(conn)
+    conn.close()
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/models/paths/train_rforest.py b/models/paths/train_rforest.py
index 06eb992..b005bdb 100644
--- a/models/paths/train_rforest.py
+++ b/models/paths/train_rforest.py
@@ -19,7 +19,7 @@
 """Training data features."""
 
 def train_random_forest(conn: sqlite3.Connection):
-    """Trains cross-reference path Random Forest classifier (scikit-learn) on full dataset and saves it to a file."""
+    """Trains cross-reference path Random Forest classifier (scikit-learn) and saves it to a file."""
     cur = conn.cursor()
     start = datetime.now()
 
@@ -48,7 +48,6 @@ def train_random_forest(conn: sqlite3.Connection):
     x_train = utils.listify(x_train)
     y_train = tuple(y_train.to_list())
 
-    # scaling
     print("Scaling data...")
     scaler = StandardScaler()
     scaler.fit(x_train)
diff --git a/models/paths/utils.py b/models/paths/utils.py
index 65d9e57..2337fc4 100644
--- a/models/paths/utils.py
+++ b/models/paths/utils.py
@@ -177,9 +177,9 @@
                     fp INTEGER NOT NULL,
                     fn INTEGER NOT NULL,
                     accuracy REAL NOT NULL,
-                    precision REAL NOT NULL,
-                    recall REAL NOT NULL,
-                    f1 REAL NOT NULL)''')
    except Exception as ex:
        print(ex)
        sys.exit()
+                    precision REAL,
+                    recall REAL,
+                    f1 REAL)''')
@@ -192,9 +192,9 @@
     fp = int(results['fp'])
     fn = int(results['fn'])
     acc = float(results['accuracy'])
-    
precision = float(results['precision'])
-    recall = float(results['recall'])
-    f1 = float(results['f1'])
+    precision = float(results['precision']) if results['precision'] is not None else None
+    recall = float(results['recall']) if results['recall'] is not None else None
+    f1 = float(results['f1']) if results['f1'] is not None else None
     try:
         # sql injection yay (table names cant be passed as params)
         cur.execute(f'INSERT INTO {table} VALUES (?,?,?,?,?,?,?,?,?,?)',
diff --git a/models/paths_rforest.joblib b/models/paths_rforest.joblib
deleted file mode 100644
index f43f76e..0000000
Binary files a/models/paths_rforest.joblib and /dev/null differ