Skip to content

Commit

Permalink
model training scripts
Browse files Browse the repository at this point in the history
+ 8 models trained for both tasks
+ fixed metric-related edge cases
+ model files are now ignored due to RandomForest producing 400MB of data
  • Loading branch information
michal-kapala committed Aug 28, 2023
1 parent 9ccd967 commit e5802bd
Show file tree
Hide file tree
Showing 23 changed files with 1,248 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.pyc
*.env
mergedb.json
*.joblib
26 changes: 25 additions & 1 deletion models/names/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,31 @@ def test_model(conn: sqlite3.Connection, results_path: str, model: str):
accuracy = (tp + tn) / (pos + neg)
precision = tp / (tp + fp)
recall = tp / pos
f1 = 2 * precision * recall / (precision + recall)

if pos + neg == 0:
print(f"Why are you testing with no data?")
sys.exit()
accuracy = (tp + tn) / (pos + neg)

if tp + fp == 0:
print("Precision could not be calculated (no positive predictions)")
precision = None
else:
precision = tp / (tp + fp)

if pos == 0:
print("Recall could not be calculated - why are you testing with no positive samples in the set?")
recall = None
else:
recall = tp / pos

if precision == 0 or recall == 0:
f1 = 0
elif precision is None or recall is None:
f1 = None
else:
f1 = 2 * precision * recall / (precision + recall)

print(f"Accuracy: {accuracy * 100:.3f}%")
results = {
"pos": pos,
Expand Down
80 changes: 80 additions & 0 deletions models/names/train_adaboost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import sqlite3, sys, os, getopt, pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from joblib import dump
from utils import NameClassifierUtils as utils


HELP = 'Usage:\npython train_adaboost.py --dbpath="<database path>"\n'
MODEL_FILE = 'names_adaboost.joblib'

def train_adaboost(conn: sqlite3.Connection):
    """Trains function name classifier using AdaBoost model (scikit-learn) and saves it to a file.

    Args:
        conn: Open connection to the SQLite database holding token and PDB data.

    Side effects:
        Writes the fitted model to ``utils.get_model_path(MODEL_FILE)``;
        exits the process if the FastText embedder cannot be loaded.
    """
    cur = conn.cursor()

    print('Loading FastText model...')
    try:
        ft = utils.load_ft(utils.get_embedder_path())
    except Exception as ex:
        # Embedder is required for all downstream steps; bail out early.
        print(ex)
        sys.exit()

    print("Fetching data...")
    tokens = utils.query_tokens(cur)
    pdb = utils.query_pdb(cur)
    # Balance positive/negative classes before splitting.
    df = utils.balance_dataset(tokens, pdb)

    literals = df['literal']
    labels = df['is_name']

    print("Splitting datasets...")
    x_train, _, y_train, _ = utils.split_dataset(literals, labels)

    print("Performing word embedding...")
    x_train = pd.DataFrame(data=x_train, columns=['literal'])
    x_train = utils.ft_embed(ft, x_train)
    x_train = utils.listify(x_train['lit_vec'].to_list())
    y_train = tuple(y_train.to_list())

    print("Scaling data...")
    scaler = StandardScaler()
    # BUG FIX: previously the result of scaler.transform() was discarded,
    # so the classifier was trained on unscaled data. fit_transform both
    # fits the scaler and returns the scaled training set.
    x_train = scaler.fit_transform(x_train)
    # NOTE(review): the fitted scaler should probably be persisted alongside
    # the model so inference applies identical scaling — confirm with the
    # prediction pipeline.

    print('Initializing classifier model...')
    # defaults to 50 estimators
    ab = AdaBoostClassifier(n_estimators=50, random_state=0)

    print("Cross-validation (5-fold)...")
    scores = cross_val_score(ab, X=x_train, y=y_train)
    print("Accuracy: %0.3f" % (scores.mean()))
    print("Std_dev: %0.3f" % (scores.std()))

    print("Training classifier...")
    ab.fit(X=x_train, y=y_train)
    file_path = utils.get_model_path(MODEL_FILE)
    dump(ab, file_path)
    print(f'Model saved to {file_path}')

def main(argv):
    """Parse CLI arguments and launch AdaBoost training on the given database."""
    path = ""
    parsed, _ = getopt.getopt(argv, "hd:", ["dbpath="])
    for flag, value in parsed:
        if flag == '-h':
            # Help requested: show usage and stop.
            print(HELP)
            sys.exit()
        if flag in ("-d", "--dbpath"):
            path = value

    # A readable database file is mandatory.
    if path == "":
        raise Exception(f"SQLite database path required\n{HELP}")
    if not os.path.isfile(path):
        raise Exception(f"Database not found at {path}")

    conn = sqlite3.connect(path)
    train_adaboost(conn)
    conn.close()

if __name__ == "__main__":
    main(sys.argv[1:])
79 changes: 79 additions & 0 deletions models/names/train_dtree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import sqlite3, sys, os, getopt, pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from joblib import dump
from utils import NameClassifierUtils as utils


HELP = 'Usage:\npython train_dtree.py --dbpath="<database path>"\n'
MODEL_FILE = 'names_dtree.joblib'

def train_decision_tree(conn: sqlite3.Connection):
    """Trains function name classifier using Decision Tree model (scikit-learn) and saves it to a file.

    Args:
        conn: Open connection to the SQLite database holding token and PDB data.

    Side effects:
        Writes the fitted model to ``utils.get_model_path(MODEL_FILE)``;
        exits the process if the FastText embedder cannot be loaded.
    """
    cur = conn.cursor()

    print('Loading FastText model...')
    try:
        ft = utils.load_ft(utils.get_embedder_path())
    except Exception as ex:
        # Embedder is required for all downstream steps; bail out early.
        print(ex)
        sys.exit()

    print("Fetching data...")
    tokens = utils.query_tokens(cur)
    pdb = utils.query_pdb(cur)
    # Balance positive/negative classes before splitting.
    df = utils.balance_dataset(tokens, pdb)

    literals = df['literal']
    labels = df['is_name']

    print("Splitting datasets...")
    x_train, _, y_train, _ = utils.split_dataset(literals, labels)

    print("Performing word embedding...")
    x_train = pd.DataFrame(data=x_train, columns=['literal'])
    x_train = utils.ft_embed(ft, x_train)
    x_train = utils.listify(x_train['lit_vec'].to_list())
    y_train = tuple(y_train.to_list())

    print("Scaling data...")
    scaler = StandardScaler()
    # BUG FIX: previously the result of scaler.transform() was discarded,
    # so the classifier was trained on unscaled data. fit_transform both
    # fits the scaler and returns the scaled training set.
    x_train = scaler.fit_transform(x_train)
    # NOTE(review): the fitted scaler should probably be persisted alongside
    # the model so inference applies identical scaling — confirm with the
    # prediction pipeline.

    print('Initializing classifier model...')
    tree = DecisionTreeClassifier(random_state=0)

    print("Cross-validation (5-fold)...")
    scores = cross_val_score(tree, X=x_train, y=y_train)
    print("Accuracy: %0.3f" % (scores.mean()))
    print("Std_dev: %0.3f" % (scores.std()))

    print("Training classifier...")
    tree.fit(X=x_train, y=y_train)
    file_path = utils.get_model_path(MODEL_FILE)
    dump(tree, file_path)
    print(f'Model saved to {file_path}')

def main(argv):
    """Parse CLI arguments and launch Decision Tree training on the given database."""
    path = ""
    parsed, _ = getopt.getopt(argv, "hd:", ["dbpath="])
    for flag, value in parsed:
        if flag == '-h':
            # Help requested: show usage and stop.
            print(HELP)
            sys.exit()
        if flag in ("-d", "--dbpath"):
            path = value

    # A readable database file is mandatory.
    if path == "":
        raise Exception(f"SQLite database path required\n{HELP}")
    if not os.path.isfile(path):
        raise Exception(f"Database not found at {path}")

    conn = sqlite3.connect(path)
    train_decision_tree(conn)
    conn.close()

if __name__ == "__main__":
    main(sys.argv[1:])
5 changes: 3 additions & 2 deletions models/names/train_gnbayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
MODEL_FILE = 'names_gnbayes.joblib'

def train_naive_bayes(conn: sqlite3.Connection):
"""Trains function name classifier using Gaussian Naive Bayes (scikit-learn) model and saves it to a file."""
"""Trains function name classifier using Gaussian Naive Bayes model (scikit-learn) and saves it to a file."""
cur = conn.cursor()

print('Loading FastText model...')
Expand All @@ -37,11 +37,12 @@ def train_naive_bayes(conn: sqlite3.Connection):
x_train = utils.listify(x_train['lit_vec'].to_list())
y_train = tuple(y_train.to_list())

# scaling
print("Scaling data...")
scaler = StandardScaler()
scaler.fit(x_train)
scaler.transform(x_train)

print('Initializing classifier model...')
gnb = GaussianNB()

print("Cross-validation (5-fold)...")
Expand Down
80 changes: 80 additions & 0 deletions models/names/train_knn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import sqlite3, sys, os, getopt, pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from joblib import dump
from utils import NameClassifierUtils as utils


HELP = 'Usage:\npython train_knn.py --dbpath="<database path>"\n'
MODEL_FILE = 'names_knn.joblib'

def train_nearest_neighbours(conn: sqlite3.Connection):
    """Trains function name classifier using k-Nearest Neighbors model (scikit-learn) and saves it to a file.

    Args:
        conn: Open connection to the SQLite database holding token and PDB data.

    Side effects:
        Writes the fitted model to ``utils.get_model_path(MODEL_FILE)``;
        exits the process if the FastText embedder cannot be loaded.
    """
    cur = conn.cursor()

    print('Loading FastText model...')
    try:
        ft = utils.load_ft(utils.get_embedder_path())
    except Exception as ex:
        # Embedder is required for all downstream steps; bail out early.
        print(ex)
        sys.exit()

    print("Fetching data...")
    tokens = utils.query_tokens(cur)
    pdb = utils.query_pdb(cur)
    # Balance positive/negative classes before splitting.
    df = utils.balance_dataset(tokens, pdb)

    literals = df['literal']
    labels = df['is_name']

    print("Splitting datasets...")
    x_train, _, y_train, _ = utils.split_dataset(literals, labels)

    print("Performing word embedding...")
    x_train = pd.DataFrame(data=x_train, columns=['literal'])
    x_train = utils.ft_embed(ft, x_train)
    x_train = utils.listify(x_train['lit_vec'].to_list())
    y_train = tuple(y_train.to_list())

    print("Scaling data...")
    scaler = StandardScaler()
    # BUG FIX: previously the result of scaler.transform() was discarded,
    # so the classifier was trained on unscaled data. Scaling matters most
    # here — k-NN is distance-based. fit_transform both fits the scaler and
    # returns the scaled training set.
    x_train = scaler.fit_transform(x_train)
    # NOTE(review): the fitted scaler should probably be persisted alongside
    # the model so inference applies identical scaling — confirm with the
    # prediction pipeline.

    print('Initializing classifier model...')
    # 5 neighbors is the default
    knn = KNeighborsClassifier(n_neighbors=5)

    print("Cross-validation (5-fold)...")
    scores = cross_val_score(knn, X=x_train, y=y_train)
    print("Accuracy: %0.3f" % (scores.mean()))
    print("Std_dev: %0.3f" % (scores.std()))

    print("Training classifier...")
    knn.fit(X=x_train, y=y_train)
    file_path = utils.get_model_path(MODEL_FILE)
    dump(knn, file_path)
    print(f'Model saved to {file_path}')

def main(argv):
    """Parse CLI arguments and launch k-NN training on the given database."""
    path = ""
    parsed, _ = getopt.getopt(argv, "hd:", ["dbpath="])
    for flag, value in parsed:
        if flag == '-h':
            # Help requested: show usage and stop.
            print(HELP)
            sys.exit()
        if flag in ("-d", "--dbpath"):
            path = value

    # A readable database file is mandatory.
    if path == "":
        raise Exception(f"SQLite database path required\n{HELP}")
    if not os.path.isfile(path):
        raise Exception(f"Database not found at {path}")

    conn = sqlite3.connect(path)
    train_nearest_neighbours(conn)
    conn.close()

if __name__ == "__main__":
    main(sys.argv[1:])
79 changes: 79 additions & 0 deletions models/names/train_logreg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import sqlite3, sys, os, getopt, pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from joblib import dump
from utils import NameClassifierUtils as utils


HELP = 'Usage:\npython train_logreg.py --dbpath="<database path>"\n'
MODEL_FILE = 'names_logreg.joblib'

def train_logistic_regression(conn: sqlite3.Connection):
    """Trains function name classifier using Logistic Regression model (scikit-learn) and saves it to a file.

    Args:
        conn: Open connection to the SQLite database holding token and PDB data.

    Side effects:
        Writes the fitted model to ``utils.get_model_path(MODEL_FILE)``;
        exits the process if the FastText embedder cannot be loaded.
    """
    cur = conn.cursor()

    print('Loading FastText model...')
    try:
        ft = utils.load_ft(utils.get_embedder_path())
    except Exception as ex:
        # Embedder is required for all downstream steps; bail out early.
        print(ex)
        sys.exit()

    print("Fetching data...")
    tokens = utils.query_tokens(cur)
    pdb = utils.query_pdb(cur)
    # Balance positive/negative classes before splitting.
    df = utils.balance_dataset(tokens, pdb)

    literals = df['literal']
    labels = df['is_name']

    print("Splitting datasets...")
    x_train, _, y_train, _ = utils.split_dataset(literals, labels)

    print("Performing word embedding...")
    x_train = pd.DataFrame(data=x_train, columns=['literal'])
    x_train = utils.ft_embed(ft, x_train)
    x_train = utils.listify(x_train['lit_vec'].to_list())
    y_train = tuple(y_train.to_list())

    print("Scaling data...")
    scaler = StandardScaler()
    # BUG FIX: previously the result of scaler.transform() was discarded,
    # so the classifier was trained on unscaled data. fit_transform both
    # fits the scaler and returns the scaled training set.
    x_train = scaler.fit_transform(x_train)
    # NOTE(review): the fitted scaler should probably be persisted alongside
    # the model so inference applies identical scaling — confirm with the
    # prediction pipeline.

    print('Initializing classifier model...')
    lr = LogisticRegression(random_state=0)

    print("Cross-validation (5-fold)...")
    scores = cross_val_score(lr, X=x_train, y=y_train)
    print("Accuracy: %0.3f" % (scores.mean()))
    print("Std_dev: %0.3f" % (scores.std()))

    print("Training classifier...")
    lr.fit(X=x_train, y=y_train)
    file_path = utils.get_model_path(MODEL_FILE)
    dump(lr, file_path)
    print(f'Model saved to {file_path}')

def main(argv):
    """Parse CLI arguments and launch Logistic Regression training on the given database."""
    path = ""
    parsed, _ = getopt.getopt(argv, "hd:", ["dbpath="])
    for flag, value in parsed:
        if flag == '-h':
            # Help requested: show usage and stop.
            print(HELP)
            sys.exit()
        if flag in ("-d", "--dbpath"):
            path = value

    # A readable database file is mandatory.
    if path == "":
        raise Exception(f"SQLite database path required\n{HELP}")
    if not os.path.isfile(path):
        raise Exception(f"Database not found at {path}")

    conn = sqlite3.connect(path)
    train_logistic_regression(conn)
    conn.close()

if __name__ == "__main__":
    main(sys.argv[1:])
Loading

0 comments on commit e5802bd

Please sign in to comment.