diff --git a/README.md b/README.md index 16bba35..c69a7c3 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ Classification-driven function name extraction plugin for IDA Pro. ## Directories See subfolder READMEs for details on plugin and script usage: +- `models` - pretrained models with their training and testing scripts - `plugins` - IDA plugin utilities for IDB information export - `scripts` - standalone scripts for data transformation; contains dependency modules diff --git a/models/README.md b/models/README.md new file mode 100644 index 0000000..827a18a --- /dev/null +++ b/models/README.md @@ -0,0 +1,12 @@ +# Models + +Serialized classifier models with corresponding training and testing scripts. +## Directories + +* `/embedder` - [`FastText`](https://fasttext.cc/) word embedder model for token text vectorization (self-trained, with source) +* `/names` - training and test scripts for function name classifiers +## Files +Naming convention for model files is `_.`. + +* `*.joblib` - classifier models serialized with [`joblib`](https://pypi.org/project/joblib/) +* `*.ft` - FastText model serialized with [`gensim.models.FastText`](https://radimrehurek.com/gensim/models/fasttext.html) diff --git a/models/embedder/embedder.ft b/models/embedder/embedder.ft index 6731d36..04f37b9 100644 Binary files a/models/embedder/embedder.ft and b/models/embedder/embedder.ft differ diff --git a/models/embedder/train_embedder.py b/models/embedder/train_embedder.py index bf0a0fc..c8c8177 100644 --- a/models/embedder/train_embedder.py +++ b/models/embedder/train_embedder.py @@ -16,12 +16,16 @@ def save_ft(model: FastText): """Saves FastText model to a file.""" model.save('embedder.ft') -def listify(lst: list[str]) -> list[list[str]]: - """Transforms `list[str]` into a `list[list[str]]`.""" - result = [] - for elem in lst: - result.append([elem]) - return result +def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame: + """Returns a complete dataset balanced with PDB positives.""" + # calculate the nb of missing positives + nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0] + nb_pos = tokens_df.shape[0] - nb_neg + nb_missing_pos = nb_neg - nb_pos + + # deterministic shuffle + balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0) + return pd.concat([tokens_df, balancing_pos], ignore_index=True) def train_token_embedder(conn: sqlite3.Connection): """Trains text feature (token) embedding model (FastText) and saves it to a file.""" @@ -52,15 +56,7 @@ def train_token_embedder(conn: sqlite3.Connection): pdb_df.at[idx, 'is_name'] = 1 df = pd.DataFrame(data=tokens, columns=COLUMNS) - - # calculate the nb of missing positives - nb_neg = df[df['is_name'] == 0].shape[0] - nb_pos = df.shape[0] - nb_neg - nb_missing_pos = nb_neg - nb_pos - - # deterministic shuffle - balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0) - df = pd.concat([df, balancing_pos], ignore_index=True) + df = balance_dataset(df, pdb_df) # deterministic shuffle, the same splitting is used for classifier datasets train, _ = train_test_split(df, test_size=TEST_SIZE_RATIO, random_state=0) diff --git a/models/names/train_nbayes.py b/models/names/train_nbayes.py index e076ae8..0213475 100644 --- a/models/names/train_nbayes.py +++ b/models/names/train_nbayes.py @@ -1,45 +1,55 @@ import sqlite3, sys, os, getopt, pandas as pd -from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB -from gensim.models import FastText +from sklearn.preprocessing import StandardScaler +from sklearn.model_selection import cross_val_score +from joblib import dump +from utils import NameClassifierUtils as utils -HELP = 'Usage:\npython train_nbayes.py --dbpath=""\n' -COLUMNS = ['binary', 'string_addr', 'literal', 'is_name'] - -def get_embedder_path(): - """Returns the path to FastText model file.""" - models_path, _ = os.path.split(os.getcwd()) - return os.path.join(models_path, 'embedder\\embedder.ft') - -def load_ft(path: str) -> FastText: - """Loads a pretrained FastText model from a file.""" - return FastText.load(path) +HELP = 'Usage:\npython train_gnbayes.py --dbpath=""\n' +MODEL_FILE = 'names_gnbayes.joblib' def train_naive_bayes(conn: sqlite3.Connection): - """Trains function name classifier using Naive Bayes (scikit-learn) model and saves it to a file.""" + """Trains function name classifier using Gaussian Naive Bayes (scikit-learn) model and saves it to a file.""" cur = conn.cursor() - - try: - cur.execute('SELECT * FROM tokens WHERE is_name IS NOT NULL') - tokens = cur.fetchall() - # 'no such table: x' - except sqlite3.OperationalError as ex: - print(ex) - sys.exit() - - df = pd.DataFrame(data=tokens, columns=COLUMNS) - tokens = df['literal'] - labels = df['is_name'] + print("Fetching data...") + tokens = utils.query_tokens(cur) + pdb = utils.query_pdb(cur) + df = utils.balance_dataset(tokens, pdb) + print('Loading FastText model...') try: - # load FastText text embedder (Windows paths only!) - ft = load_ft(get_embedder_path()) + ft = utils.load_ft(utils.get_embedder_path()) except Exception as ex: print(ex) sys.exit() - print(ft.wv['std::cout']) - # X_train, X_test, y_train, y_test = train_test_split(df, test_size=0.1, random_state=0) + literals = df['literal'] + labels = df['is_name'] + + print("Splitting datasets...") + x_train, x_test, y_train, y_test = utils.split_dataset(literals, labels) + print("Performing word embedding...") + x_train = pd.DataFrame(data=x_train, columns = ['literal']) + x_train = utils.ft_embed(ft, x_train) + x_train = utils.listify(x_train['lit_vec'].to_list()) + y_train = tuple(y_train.to_list()) + + # scaling + scaler = StandardScaler() + scaler.fit(x_train) + scaler.transform(x_train) + + gnb = GaussianNB() + + # cross-validation + scores = cross_val_score(gnb, X=x_train, y=y_train, cv=10) + print("Accuracy: %0.3f, std_dev: %0.3f" % (scores.mean(), scores.std())) + + print("Training classifier...") + gnb.fit(X=x_train, y=y_train) + file_path = utils.get_model_path(MODEL_FILE) + dump(gnb, file_path) + print(f'Model saved to {file_path}') def main(argv): db_path = "" diff --git a/models/names/utils.py b/models/names/utils.py new file mode 100644 index 0000000..848140a --- /dev/null +++ b/models/names/utils.py @@ -0,0 +1,91 @@ +import sqlite3, sys, os, pandas as pd +from gensim.models import FastText +from sklearn.model_selection import train_test_split + +_COLUMNS = ['literal', 'is_name'] +_TEST_SIZE_RATIO = 0.2 +"""Desired percentage of test samples in the dataset.""" + +class NameClassifierUtils: + """Utility functions for function name classifiers.""" + @staticmethod + def query_tokens(cur: sqlite3.Cursor) -> pd.DataFrame: + """Returns all labelled tokens from the dataset.""" + try: + cur.execute('SELECT literal, is_name FROM tokens WHERE is_name IS NOT NULL') + tokens = cur.fetchall() + # 'no such table: x' + except sqlite3.OperationalError as ex: + print(ex) + sys.exit() + + return pd.DataFrame(data=tokens, columns=_COLUMNS) + + @staticmethod + def query_pdb(cur: sqlite3.Cursor) -> pd.DataFrame: + """Returns all PDB function names from the dataset.""" + try: + cur.execute('SELECT literal FROM pdb') + pdb = cur.fetchall() + # 'no such table: x' + except sqlite3.OperationalError as ex: + print(ex) + sys.exit() + df = pd.DataFrame(data=pdb, columns=['literal'], index=range(len(pdb))) + + df['is_name'] = '' + for idx in df.index: + df.at[idx, 'is_name'] = 1 + + return df + + @staticmethod + def get_embedder_path() -> str: + """Returns the path to FastText model file (only supports Windows paths).""" + models_path, _ = os.path.split(os.getcwd()) + return os.path.join(models_path, 'embedder\\embedder.ft') + + @staticmethod + def get_model_path(filename: str) -> str: + """Returns the target path for model file.""" + models_path, _ = os.path.split(os.getcwd()) + return os.path.join(models_path, filename) + + @staticmethod + def load_ft(path: str) -> FastText: + """Loads a pretrained FastText model from a file.""" + return FastText.load(path) + + @staticmethod + def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame: + """Returns a complete dataset balanced with PDB positives.""" + # calculate the nb of missing positives + nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0] + nb_pos = tokens_df.shape[0] - nb_neg + nb_missing_pos = nb_neg - nb_pos + + # deterministic shuffle + balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0) + return pd.concat([tokens_df, balancing_pos], ignore_index=True) + + @staticmethod + def split_dataset(features: pd.DataFrame, labels: pd.DataFrame) -> tuple: + """Parameterized wrapper for `sklearn.model_selection.train_test_split`.""" + # Deterministic shuffle + x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=_TEST_SIZE_RATIO, random_state=0) + return x_train, x_test, y_train, y_test + + @staticmethod + def ft_embed(ft: FastText, tokens: pd.DataFrame): + """Performs vectorization on token text data.""" + tokens['lit_vec'] = '' + for idx in tokens.index: + tokens.at[idx, 'lit_vec'] = ft.wv[tokens.at[idx, 'literal']] + return tokens + + def listify(lst: list) -> list[list]: + """Transforms `list[numpy.array]` into a `list[list[any]]`.""" + result = [] + for elem in lst: + result.append([elem.tolist()]) + return result diff --git a/models/names_gnbayes.joblib b/models/names_gnbayes.joblib new file mode 100644 index 0000000..2c8b539 Binary files /dev/null and b/models/names_gnbayes.joblib differ