Skip to content

Commit

Permalink
naive bayes training
Browse files Browse the repository at this point in the history
+ training script utils for function name classifiers
+ gaussian naive bayes model trained with amazing 62% cross-validation accuracy score
  • Loading branch information
michal-kapala committed Aug 26, 2023
1 parent d143c63 commit 376a2e6
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 45 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Classification-driven function name extraction plugin for IDA Pro.
## Directories

See subfolder READMEs for details on plugin and script usage:
- `models` - pretrained models with their training and testing scripts
- `plugins` - IDA plugin utilities for IDB information export
- `scripts` - standalone scripts for data transformation; contains dependency modules

Expand Down
12 changes: 12 additions & 0 deletions models/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Models

Serialized classifier models with corresponding training and testing scripts.
## Directories

* `/embedder` - [`FastText`](https://fasttext.cc/) word embedder model for token text vectorization (self-trained, with source)
* `/names` - training and test scripts for function name classifiers
## Files
Model files are named `<classifier set>_<classifier name>.<serialization source>`, where the file extension identifies the library used to serialize the model.

* `*.joblib` - classifier models serialized with [`joblib`](https://pypi.org/project/joblib/)
* `*.ft` - FastText model serialized with [`gensim.models.FastText`](https://radimrehurek.com/gensim/models/fasttext.html)
Binary file modified models/embedder/embedder.ft
Binary file not shown.
26 changes: 11 additions & 15 deletions models/embedder/train_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,16 @@ def save_ft(model: FastText):
"""Saves FastText model to a file."""
model.save('embedder.ft')

def listify(lst: list[str]) -> list[list[str]]:
    """Transforms `list[str]` into a `list[list[str]]`.

    Every element of `lst` is wrapped in its own single-element list and the
    input order is preserved; an empty input yields an empty list.
    """
    return [[elem] for elem in lst]
def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame:
    """Returns a complete dataset balanced with PDB positives.

    Args:
        tokens_df: labelled tokens; rows with `is_name == 0` are counted as negatives,
            all remaining rows as positives.
        pdb_df: pool of additional positive samples.

    Returns:
        A new frame (index reset) holding `tokens_df` followed by as many rows of
        `pdb_df` as are needed to even out the classes. Nothing is added when the
        positives already match or outnumber the negatives; the whole pool is added
        when it is too small to cover the deficit.
    """
    # calculate the nb of missing positives
    nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0]
    nb_pos = tokens_df.shape[0] - nb_neg
    nb_missing_pos = nb_neg - nb_pos

    # nothing to add; train_test_split would reject a non-positive train_size
    if nb_missing_pos <= 0:
        return pd.concat([tokens_df], ignore_index=True)

    if nb_missing_pos >= pdb_df.shape[0]:
        # train_test_split requires train_size to be smaller than the pool, so take all of it
        balancing_pos = pdb_df
    else:
        # deterministic shuffle
        balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
    return pd.concat([tokens_df, balancing_pos], ignore_index=True)

def train_token_embedder(conn: sqlite3.Connection):
"""Trains text feature (token) embedding model (FastText) and saves it to a file."""
Expand Down Expand Up @@ -52,15 +56,7 @@ def train_token_embedder(conn: sqlite3.Connection):
pdb_df.at[idx, 'is_name'] = 1

df = pd.DataFrame(data=tokens, columns=COLUMNS)

# calculate the nb of missing positives
nb_neg = df[df['is_name'] == 0].shape[0]
nb_pos = df.shape[0] - nb_neg
nb_missing_pos = nb_neg - nb_pos

# deterministic shuffle
balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
df = pd.concat([df, balancing_pos], ignore_index=True)
df = balance_dataset(df, pdb_df)

# deterministic shuffle, the same splitting is used for classifier datasets
train, _ = train_test_split(df, test_size=TEST_SIZE_RATIO, random_state=0)
Expand Down
70 changes: 40 additions & 30 deletions models/names/train_nbayes.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,55 @@
import sqlite3, sys, os, getopt, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from joblib import dump
from utils import NameClassifierUtils as utils


# Command-line usage text for this script.
HELP = 'Usage:\npython train_nbayes.py --dbpath="<database path>"\n'
# Column names applied to the rows fetched with `SELECT * FROM tokens`.
COLUMNS = ['binary', 'string_addr', 'literal', 'is_name']

def get_embedder_path():
    """Returns the path to FastText model file.

    The result is `<parent of the current working directory>/embedder/embedder.ft`.
    The components are joined one by one so the path uses the separator of the
    running platform instead of a hard-coded Windows backslash.
    """
    models_path, _ = os.path.split(os.getcwd())
    return os.path.join(models_path, 'embedder', 'embedder.ft')

def load_ft(path: str) -> FastText:
    """Reads a previously trained FastText model from the file at `path`."""
    model = FastText.load(path)
    return model
# Command-line usage text for this script.
# NOTE(review): the script name used here (`train_gnbayes.py`) differs from the file name
# shown for this diff (`models/names/train_nbayes.py`) - confirm which one is intended.
HELP = 'Usage:\npython train_gnbayes.py --dbpath="<database path>"\n'
# File name of the serialized classifier; resolved to a full path with `utils.get_model_path`.
MODEL_FILE = 'names_gnbayes.joblib'

def train_naive_bayes(conn: sqlite3.Connection):
    """Trains function name classifier using Gaussian Naive Bayes (scikit-learn) model and saves it to a file.

    Steps: fetch the labelled tokens and PDB names, balance the dataset, split it,
    embed the training literals with the pretrained FastText model, report a
    10-fold cross-validation accuracy and finally fit and serialize the model.

    Standardization and the classifier are combined into a single pipeline.
    Previously the result of `scaler.transform` was discarded, so the scaling was
    never applied. With the pipeline the scaler is re-fitted inside every
    cross-validation fold and is serialized together with the classifier, so the
    saved model accepts raw embedding vectors.

    Args:
        conn: open connection to the SQLite dataset database.
    """
    # local import: `make_pipeline` is not among the module-level imports
    from sklearn.pipeline import make_pipeline

    cur = conn.cursor()

    print("Fetching data...")
    tokens = utils.query_tokens(cur)
    pdb = utils.query_pdb(cur)
    df = utils.balance_dataset(tokens, pdb)

    print('Loading FastText model...')
    try:
        ft = utils.load_ft(utils.get_embedder_path())
    except Exception as ex:
        print(ex)
        sys.exit()

    literals = df['literal']
    labels = df['is_name']

    print("Splitting datasets...")
    # only the training part of the split is used by this script
    x_train, _, y_train, _ = utils.split_dataset(literals, labels)

    print("Performing word embedding...")
    x_train = pd.DataFrame(data=x_train, columns=['literal'])
    x_train = utils.ft_embed(ft, x_train)
    x_train = utils.listify(x_train['lit_vec'].to_list())
    y_train = tuple(y_train.to_list())

    # scaling + classifier; NOTE: the dumped object is this pipeline, not a bare GaussianNB
    model = make_pipeline(StandardScaler(), GaussianNB())

    # cross-validation
    scores = cross_val_score(model, X=x_train, y=y_train, cv=10)
    print("Accuracy: %0.3f, std_dev: %0.3f" % (scores.mean(), scores.std()))

    print("Training classifier...")
    model.fit(x_train, y_train)
    file_path = utils.get_model_path(MODEL_FILE)
    dump(model, file_path)
    print(f'Model saved to {file_path}')

def main(argv):
db_path = ""
Expand Down
91 changes: 91 additions & 0 deletions models/names/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import sqlite3, sys, os, pandas as pd
from gensim.models import FastText
from sklearn.model_selection import train_test_split

# Column names of the frame built from the `SELECT literal, is_name FROM tokens` query.
_COLUMNS = ['literal', 'is_name']
# Passed as `test_size` to `train_test_split`; the value is a fraction (0.2 = 20% of the samples).
_TEST_SIZE_RATIO = 0.2
"""Desired percentage of test samples in the dataset."""

class NameClassifierUtils:
    """Utility functions for function name classifiers.

    All helpers are static; the class only serves as a namespace.
    """

    @staticmethod
    def query_tokens(cur: sqlite3.Cursor) -> pd.DataFrame:
        """Returns all labelled tokens from the dataset.

        Prints the error and exits the process when the query fails
        (e.g. the `tokens` table does not exist).
        """
        try:
            cur.execute('SELECT literal, is_name FROM tokens WHERE is_name IS NOT NULL')
            tokens = cur.fetchall()
        # 'no such table: x'
        except sqlite3.OperationalError as ex:
            print(ex)
            sys.exit()

        return pd.DataFrame(data=tokens, columns=_COLUMNS)

    @staticmethod
    def query_pdb(cur: sqlite3.Cursor) -> pd.DataFrame:
        """Returns all PDB function names from the dataset.

        Every returned row is labelled as a positive sample (`is_name == 1`).
        Prints the error and exits the process when the query fails
        (e.g. the `pdb` table does not exist).
        """
        try:
            cur.execute('SELECT literal FROM pdb')
            pdb = cur.fetchall()
        # 'no such table: x'
        except sqlite3.OperationalError as ex:
            print(ex)
            sys.exit()

        df = pd.DataFrame(data=pdb, columns=['literal'])
        # label the whole column at once instead of row by row
        df['is_name'] = 1
        return df

    @staticmethod
    def get_embedder_path() -> str:
        """Returns the path to FastText model file.

        The result is `<parent of the current working directory>/embedder/embedder.ft`,
        joined component by component so it is valid on any platform.
        """
        models_path, _ = os.path.split(os.getcwd())
        return os.path.join(models_path, 'embedder', 'embedder.ft')

    @staticmethod
    def get_model_path(filename: str) -> str:
        """Returns the target path for model file.

        The file is placed in the parent of the current working directory.
        """
        models_path, _ = os.path.split(os.getcwd())
        return os.path.join(models_path, filename)

    @staticmethod
    def load_ft(path: str) -> "FastText":
        """Loads a pretrained FastText model from a file."""
        return FastText.load(path)

    @staticmethod
    def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame:
        """Returns a complete dataset balanced with PDB positives.

        Args:
            tokens_df: labelled tokens; rows with `is_name == 0` are counted as
                negatives, all remaining rows as positives.
            pdb_df: pool of additional positive samples.

        Returns:
            A new frame (index reset) holding `tokens_df` followed by as many rows of
            `pdb_df` as are needed to even out the classes. Nothing is added when the
            positives already match or outnumber the negatives; the whole pool is
            added when it is too small to cover the deficit.
        """
        # calculate the nb of missing positives
        nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0]
        nb_pos = tokens_df.shape[0] - nb_neg
        nb_missing_pos = nb_neg - nb_pos

        # nothing to add; train_test_split would reject a non-positive train_size
        if nb_missing_pos <= 0:
            return pd.concat([tokens_df], ignore_index=True)

        if nb_missing_pos >= pdb_df.shape[0]:
            # train_test_split requires train_size to be smaller than the pool, so take all of it
            balancing_pos = pdb_df
        else:
            # deterministic shuffle
            balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
        return pd.concat([tokens_df, balancing_pos], ignore_index=True)

    @staticmethod
    def split_dataset(features: pd.DataFrame, labels: pd.DataFrame) -> tuple:
        """Parameterized wrapper for `sklearn.model_selection.train_test_split`.

        Returns `(x_train, x_test, y_train, y_test)`; the test share is
        `_TEST_SIZE_RATIO` and the shuffle is deterministic (`random_state=0`).
        """
        return tuple(
            train_test_split(features, labels, test_size=_TEST_SIZE_RATIO, random_state=0)
        )

    @staticmethod
    def ft_embed(ft: "FastText", tokens: pd.DataFrame):
        """Performs vectorization on token text data.

        Adds a `lit_vec` column to `tokens` (the passed frame is modified in place)
        holding `ft.wv[...]` of each row's `literal`, and returns the same frame.
        """
        # placeholder column, overwritten cell by cell below
        tokens['lit_vec'] = ''
        for idx in tokens.index:
            tokens.at[idx, 'lit_vec'] = ft.wv[tokens.at[idx, 'literal']]
        return tokens

    @staticmethod
    def listify(lst: list) -> list[list]:
        """Wraps the `tolist()` result of every element in its own single-element list.

        Elements are expected to provide `tolist()` (e.g. NumPy arrays or scalars);
        the input order is preserved.
        """
        return [[elem.tolist()] for elem in lst]
Binary file added models/names_gnbayes.joblib
Binary file not shown.

0 comments on commit 376a2e6

Please sign in to comment.