naive bayes training

+ training script utils for function name classifiers + gaussian naive bayes model trained with amazing 62% cross-validation accuracy score
michal-kapala · Aug 26, 2023 · 376a2e6 · 376a2e6
1 parent d143c63
commit 376a2e6
Show file tree

Hide file tree

Showing 7 changed files with 155 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,7 @@ Classification-driven function name extraction plugin for IDA Pro.
 ## Directories
 
 See subfolder READMEs for details on plugin and script usage:
+- `models` - pretrained models with their training and testing scripts
 - `plugins` - IDA plugin utilities for IDB information export
 - `scripts` - standalone scripts for data transformation; contains dependency modules
 

diff --git a/models/README.md b/models/README.md
@@ -0,0 +1,12 @@
+# Models
+
+Serialized classifier models with corresponding training and testing scripts.
+## Directories
+
+* `/embedder` - [`FastText`](https://fasttext.cc/) word embedder model for token text vectorization (self-trained, with source)
+* `/names` - training and test scripts for function name classifiers
+## Files
+The naming convention for model files is `<classifier set>_<classifier name>.<serialization source>`, where the extension indicates how the model was serialized (see the list below).
+
+* `*.joblib` - classifier models serialized with [`joblib`](https://pypi.org/project/joblib/)
+* `*.ft` - FastText model serialized with [`gensim.models.FastText`](https://radimrehurek.com/gensim/models/fasttext.html)
diff --git a/models/embedder/embedder.ft b/models/embedder/embedder.ft
diff --git a/models/embedder/train_embedder.py b/models/embedder/train_embedder.py
@@ -16,12 +16,16 @@ def save_ft(model: FastText):
   """Saves FastText model to a file."""
   model.save('embedder.ft')
 
-def listify(lst: list[str]) -> list[list[str]]:
-  """Transforms `list[str]` into a `list[list[str]]`."""
-  result = []
-  for elem in lst:
-    result.append([elem])
-  return result
+def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame:
+  """Returns a complete dataset balanced with PDB positives.
+
+  Rows of `tokens_df` with `is_name == 0` count as negatives, all other rows as
+  positives. Rows of `pdb_df` are appended to make up for the missing positives.
+  When no positives are missing (or `pdb_df` is empty) `tokens_df` is returned
+  re-indexed; when `pdb_df` is too small to close the gap, all of it is appended.
+  """
+  # calculate the nb of missing positives
+  nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0]
+  nb_pos = tokens_df.shape[0] - nb_neg
+  nb_missing_pos = nb_neg - nb_pos
+
+  # train_test_split rejects a non-positive train size and an empty input
+  if nb_missing_pos <= 0 or pdb_df.empty:
+    return tokens_df.reset_index(drop=True)
+
+  # train_test_split must keep at least one sample for the discarded part
+  if nb_missing_pos >= pdb_df.shape[0]:
+    return pd.concat([tokens_df, pdb_df], ignore_index=True)
+
+  # deterministic shuffle
+  balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
+  return pd.concat([tokens_df, balancing_pos], ignore_index=True)
 
 def train_token_embedder(conn: sqlite3.Connection):
   """Trains text feature (token) embedding model (FastText) and saves it to a file."""
@@ -52,15 +56,7 @@ def train_token_embedder(conn: sqlite3.Connection):
     pdb_df.at[idx, 'is_name'] = 1
 
   df = pd.DataFrame(data=tokens, columns=COLUMNS)
-
-  # calculate the nb of missing positives
-  nb_neg = df[df['is_name'] == 0].shape[0]
-  nb_pos = df.shape[0] - nb_neg
-  nb_missing_pos = nb_neg - nb_pos
-
-  # deterministic shuffle
-  balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
-  df = pd.concat([df, balancing_pos], ignore_index=True)
+  df = balance_dataset(df, pdb_df)
 
   # deterministic shuffle, the same splitting is used for classifier datasets
   train, _ = train_test_split(df, test_size=TEST_SIZE_RATIO, random_state=0)

diff --git a/models/names/train_nbayes.py b/models/names/train_nbayes.py
@@ -1,45 +1,55 @@
 import sqlite3, sys, os, getopt, pandas as pd
-from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import GaussianNB
-from gensim.models import FastText
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import cross_val_score
+from joblib import dump
+from utils import NameClassifierUtils as utils
 
 
-HELP = 'Usage:\npython train_nbayes.py --dbpath="<database path>"\n'
-COLUMNS = ['binary', 'string_addr', 'literal', 'is_name']
-
-def get_embedder_path():
-  """Returns the path to FastText model file."""
-  models_path, _ = os.path.split(os.getcwd())
-  return os.path.join(models_path, 'embedder\\embedder.ft')
-
-def load_ft(path: str) -> FastText:
-  """Loads a pretrained FastText model from a file."""
-  return FastText.load(path)
+# command-line usage text; the script name matches this file (train_nbayes.py)
+HELP = 'Usage:\npython train_nbayes.py --dbpath="<database path>"\n'
+# file name of the serialized classifier, resolved with `utils.get_model_path`
+MODEL_FILE = 'names_gnbayes.joblib'
 
 def train_naive_bayes(conn: sqlite3.Connection):
-  """Trains function name classifier using Naive Bayes (scikit-learn) model and saves it to a file."""
+  """Trains function name classifier using Gaussian Naive Bayes (scikit-learn) model and saves it to a file.
+
+  The labelled tokens are balanced with PDB positives and split into train and
+  test parts; the training literals are embedded with the pretrained FastText model.
+  A `StandardScaler` + `GaussianNB` pipeline is scored with 10-fold cross-validation,
+  fitted on the whole training part and dumped to `MODEL_FILE` with joblib.
+  The test part of the split is not used here.
+  Exits the process if the FastText model cannot be loaded.
+  """
+  # local import keeps this fix independent of the file-level import block
+  from sklearn.pipeline import make_pipeline
   cur = conn.cursor()
-
-  try:
-    cur.execute('SELECT * FROM tokens WHERE is_name IS NOT NULL')
-    tokens = cur.fetchall()
-  # 'no such table: x'
-  except sqlite3.OperationalError as ex:
-    print(ex)
-    sys.exit()
-
-  df = pd.DataFrame(data=tokens, columns=COLUMNS)
-  tokens = df['literal']
-  labels = df['is_name']
+  print("Fetching data...")
+  tokens = utils.query_tokens(cur)
+  pdb = utils.query_pdb(cur)
+  df = utils.balance_dataset(tokens, pdb)
+  print('Loading FastText model...')
   try:
-    # load FastText text embedder (Windows paths only!)
-    ft = load_ft(get_embedder_path())
+    ft = utils.load_ft(utils.get_embedder_path())
   except Exception as ex:
     print(ex)
     sys.exit()
-  print(ft.wv['std::cout'])
-  # X_train, X_test, y_train, y_test = train_test_split(df, test_size=0.1, random_state=0)
+  literals = df['literal']
+  labels = df['is_name']
+
+  print("Splitting datasets...")
+  x_train, x_test, y_train, y_test = utils.split_dataset(literals, labels)
 
+  print("Performing word embedding...")
+  x_train = pd.DataFrame(data=x_train, columns = ['literal'])
+  x_train = utils.ft_embed(ft, x_train)
+  x_train = utils.listify(x_train['lit_vec'].to_list())
+  y_train = tuple(y_train.to_list())
+
+  # scaling lives inside the estimator: `StandardScaler.transform` returns a new
+  # array, so calling it without using the result scaled nothing. As a pipeline
+  # step the scaler is refitted on the training part of every cross-validation
+  # fold and is serialized together with the classifier.
+  gnb = make_pipeline(StandardScaler(), GaussianNB())
+
+  # cross-validation
+  scores = cross_val_score(gnb, X=x_train, y=y_train, cv=10)
+  print("Accuracy: %0.3f, std_dev: %0.3f" % (scores.mean(), scores.std()))
+
+  print("Training classifier...")
+  gnb.fit(X=x_train, y=y_train)
+  file_path = utils.get_model_path(MODEL_FILE)
+  dump(gnb, file_path)
+  print(f'Model saved to {file_path}')
 
 def main(argv):
   db_path = ""

diff --git a/models/names/utils.py b/models/names/utils.py
@@ -0,0 +1,91 @@
+import sqlite3, sys, os, pandas as pd
+from gensim.models import FastText
+from sklearn.model_selection import train_test_split
+
+_COLUMNS = ['literal', 'is_name']
+_TEST_SIZE_RATIO = 0.2
+"""Desired percentage of test samples in the dataset."""
+
+class NameClassifierUtils:
+  """Utility functions for function name classifiers."""
+  @staticmethod
+  def query_tokens(cur: sqlite3.Cursor) -> pd.DataFrame:
+    """Returns all labelled tokens from the dataset.
+
+    Selects `literal` and `is_name` of every `tokens` row whose `is_name` is not NULL.
+    Prints the error and exits the process on `sqlite3.OperationalError`.
+    """
+    try:
+      cur.execute('SELECT literal, is_name FROM tokens WHERE is_name IS NOT NULL')
+      tokens = cur.fetchall()
+    # 'no such table: x'
+    except sqlite3.OperationalError as ex:
+      print(ex)
+      sys.exit()
+
+    return pd.DataFrame(data=tokens, columns=_COLUMNS)
+
+  @staticmethod
+  def query_pdb(cur: sqlite3.Cursor) -> pd.DataFrame:
+    """Returns all PDB function names from the dataset.
+
+    The result has a `literal` column read from the `pdb` table and an `is_name`
+    column set to 1 for every row.
+    Prints the error and exits the process on `sqlite3.OperationalError`.
+    """
+    try:
+      cur.execute('SELECT literal FROM pdb')
+      pdb = cur.fetchall()
+    # 'no such table: x'
+    except sqlite3.OperationalError as ex:
+      print(ex)
+      sys.exit()
+    df = pd.DataFrame(data=pdb, columns=['literal'])
+
+    # every PDB entry is a positive sample; one column-wide assignment replaces
+    # the per-row loop over a '' placeholder column
+    df['is_name'] = 1
+
+    return df
+
+  @staticmethod
+  def get_embedder_path() -> str:
+    """Returns the path to FastText model file.
+
+    The path is `embedder/embedder.ft` under the parent of the current working
+    directory, joined with the platform's path separator.
+    """
+    models_path, _ = os.path.split(os.getcwd())
+    return os.path.join(models_path, 'embedder', 'embedder.ft')
+
+  @staticmethod
+  def get_model_path(filename: str) -> str:
+    """Returns the target path for model file (parent of the current working directory)."""
+    models_path, _ = os.path.split(os.getcwd())
+    return os.path.join(models_path, filename)
+
+  @staticmethod
+  def load_ft(path: str) -> FastText:
+    """Loads a pretrained FastText model from a file."""
+    return FastText.load(path)
+
+  @staticmethod
+  def balance_dataset(tokens_df: pd.DataFrame, pdb_df: pd.DataFrame) -> pd.DataFrame:
+    """Returns a complete dataset balanced with PDB positives.
+
+    Rows of `tokens_df` with `is_name == 0` count as negatives, all other rows as
+    positives. Rows of `pdb_df` are appended to make up for the missing positives.
+    When no positives are missing (or `pdb_df` is empty) `tokens_df` is returned
+    re-indexed; when `pdb_df` is too small to close the gap, all of it is appended.
+    """
+    # calculate the nb of missing positives
+    nb_neg = tokens_df[tokens_df['is_name'] == 0].shape[0]
+    nb_pos = tokens_df.shape[0] - nb_neg
+    nb_missing_pos = nb_neg - nb_pos
+
+    # train_test_split rejects a non-positive train size and an empty input
+    if nb_missing_pos <= 0 or pdb_df.empty:
+      return tokens_df.reset_index(drop=True)
+
+    # train_test_split must keep at least one sample for the discarded part
+    if nb_missing_pos >= pdb_df.shape[0]:
+      return pd.concat([tokens_df, pdb_df], ignore_index=True)
+
+    # deterministic shuffle
+    balancing_pos, _ = train_test_split(pdb_df, train_size=nb_missing_pos, random_state=0)
+    return pd.concat([tokens_df, balancing_pos], ignore_index=True)
+
+  @staticmethod
+  def split_dataset(features: pd.DataFrame, labels: pd.DataFrame) -> tuple:
+    """Parameterized wrapper for `sklearn.model_selection.train_test_split`.
+
+    Returns `(x_train, x_test, y_train, y_test)` using `_TEST_SIZE_RATIO` as the
+    test size and a fixed random state.
+    """
+    # Deterministic shuffle
+    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=_TEST_SIZE_RATIO, random_state=0)
+    return x_train, x_test, y_train, y_test
+
+  @staticmethod
+  def ft_embed(ft: FastText, tokens: pd.DataFrame):
+    """Performs vectorization on token text data.
+
+    Adds a `lit_vec` column to `tokens` in place, holding `ft.wv[literal]` for each
+    row, and returns the same frame.
+    """
+    tokens['lit_vec'] = ''
+    for idx in tokens.index:
+      tokens.at[idx, 'lit_vec'] = ft.wv[tokens.at[idx, 'literal']]
+    return tokens
+
+  @staticmethod
+  def listify(lst: list) -> list[list]:
+    """Wraps the `tolist()` form of every element of `lst` in a single-item list."""
+    # NOTE(review): each result item is `[elem.tolist()]`, i.e. one nesting level
+    # deeper than a plain list of lists - confirm this is the shape callers expect
+    return [[elem.tolist()] for elem in lst]
diff --git a/models/names_gnbayes.joblib b/models/names_gnbayes.joblib