diff --git a/malgan/__init__.py b/malgan/__init__.py index b99974a..3df9e08 100644 --- a/malgan/__init__.py +++ b/malgan/__init__.py @@ -23,7 +23,6 @@ import tensorboardX import torch import torch.nn as nn -# import torch.nn.utils import torch.optim import torch.utils.data from torch.utils.data import Dataset, DataLoader, Subset @@ -93,7 +92,7 @@ class MalGAN(nn.Module): MALWARE_BATCH_SIZE = 32 - EXPORT_DIR = Path("export") + SAVED_MODEL_DIR = Path("saved_models") VALIDATION_SPLIT = 0.2 @@ -247,7 +246,7 @@ def fit_one_cycle(self, cyc_len: int, quiet_mode: bool = False) -> None: valid_l_g = self._meas_loader_gen_loss(self._mal_data.valid) MalGAN.tensorboard.add_scalar('Validation_Generator_Loss', valid_l_g, epoch_cnt) if valid_l_g < best_loss: - self._export(self._build_export_name(epoch_cnt)) + self._save(self._build_export_name(epoch_cnt)) if best_epoch is not None: self._delete_old_backup(best_epoch) best_epoch, best_loss = epoch_cnt, valid_l_g @@ -258,7 +257,7 @@ def fit_one_cycle(self, cyc_len: int, quiet_mode: bool = False) -> None: MalGAN.tensorboard.close() self.load(self._build_export_name(best_epoch)) - self._export(self._build_export_name()) + self._save(self._build_export_name()) self._delete_old_backup(best_epoch) def _build_export_name(self, epoch_num: int = None) -> str: @@ -276,7 +275,7 @@ def _build_export_name(self, epoch_num: int = None) -> str: "final" if epoch_num is None else "epoch_%05d" % epoch_num] # Either add an epoch name or - return MalGAN.EXPORT_DIR / "".join(["_".join(name).lower(), ".pth"]) + return MalGAN.SAVED_MODEL_DIR / "".join(["_".join(name).lower(), ".pth"]) def _delete_old_backup(self, epoch_num: int) -> None: """ @@ -376,40 +375,45 @@ def measure_and_export_results(self) -> str: logging.debug("Final Validation Loss: %.6f", valid_loss) logging.debug("Final Test Loss: %.6f", test_loss) - num_bits_changed = num_mal_test = 0 - m_prime_arr = [] + num_mal_test = 0 + y_mal_orig, m_prime_arr, bits_changed = [], [], [] for m, _ in self._mal_data.test: + y_mal_orig.append(self._bb.predict(m.cpu())) if self._is_cuda: m = m.cuda() num_mal_test += m.shape[0] m_prime, _ = self._gen.forward(m) m_prime_arr.append(m_prime.cpu() if self._is_cuda else m_prime) + m_diff = m_prime - m - num_bits_changed += torch.sum(m_diff) - # Error check no bits flipped 1 -> 0 + bits_changed.append(torch.sum(m_diff.cpu(), dim=1)) + + # Sanity check no bits flipped 1 -> 0 msg = "Malware signature changed to 0 which is not allowed" assert torch.sum(m_diff < -0.1) == 0, msg - avg_changed_bits = num_bits_changed / num_mal_test + avg_changed_bits = torch.cat(bits_changed).mean() logging.debug("Avg. Malware Bits Changed Changed: %2f", avg_changed_bits) + # BB prediction of the malware before the generator + y_mal_orig = torch.cat(y_mal_orig) + # Build an X tensor for prediction using the detector - ben_test_arr = [] - for x_tmp, _ in self._ben_data.test: - ben_test_arr.append(x_tmp.cpu() if self._is_cuda else x_tmp) + ben_test_arr = [x.cpu() if self._is_cuda else x for x, _ in self._ben_data.test] x = torch.cat(m_prime_arr + ben_test_arr) - y = torch.cat((torch.full((num_mal_test,), MalGAN.Label.Malware.value), - torch.full((len(x) - num_mal_test,), MalGAN.Label.Benign.value))) + y_actual = torch.cat((torch.full((num_mal_test,), MalGAN.Label.Malware.value), + torch.full((len(x) - num_mal_test,), MalGAN.Label.Benign.value))) - y_hat = self._bb.predict(x) + y_hat_post = self._bb.predict(x) if self._is_cuda: - y_hat, y = y_hat.cpu(), y.cpu() + y_mal_orig, y_hat_post, y_actual = y_mal_orig.cpu(), y_hat_post.cpu(), y_actual.cpu() # noinspection PyProtectedMember y_prob = self._bb._model.predict_proba(x) # pylint: disable=protected-access y_prob = y_prob[:, MalGAN.Label.Malware.value] - return _export_results(self, valid_loss, test_loss, avg_changed_bits, y, y_prob, y_hat) + return _export_results(self, valid_loss, test_loss, avg_changed_bits, y_actual, + y_mal_orig, y_prob, y_hat_post) - def _export(self, file_path: PathOrStr) -> None: + def _save(self, file_path: PathOrStr) -> None: r""" Export the specified model to disk. The function creates any files needed on the path. All exported models will be relative to \p EXPORT_DIR class object. diff --git a/malgan/_export_results.py b/malgan/_export_results.py index 11915a1..2b8251b 100644 --- a/malgan/_export_results.py +++ b/malgan/_export_results.py @@ -8,12 +8,13 @@ from sklearn.metrics import confusion_matrix, roc_auc_score TensorOrFloat = Union[torch.Tensor, float] +TorchOrNumpy = Union[torch.Tensor, np.ndarray] # noinspection PyProtectedMember,PyUnresolvedReferences def _export_results(model: 'MalGAN', valid_loss: TensorOrFloat, test_loss: TensorOrFloat, - avg_num_bits_changed: TensorOrFloat, y: np.ndarray, - y_prob: Union[np.ndarray, torch.Tensor], y_hat: np.ndarray) -> str: + avg_num_bits_changed: TensorOrFloat, y_actual: np.ndarray, + y_mal_orig: TorchOrNumpy, y_prob: TorchOrNumpy, y_hat: np.ndarray) -> str: r""" Exports MalGAN results. @@ -21,13 +22,16 @@ def _export_results(model: 'MalGAN', valid_loss: TensorOrFloat, test_loss: Tenso :param valid_loss: Average loss on the malware validation set :param test_loss: Average loss on the malware test set :param avg_num_bits_changed: - :param y: Actual labels + :param y_actual: Actual labels + :param y_mal_orig: Predicted value on the original (unmodified) malware :param y_prob: Probability of malware :param y_hat: Predict labels :return: Results string """ if isinstance(y_prob, torch.Tensor): y_prob = y_prob.numpy() + if isinstance(y_mal_orig, torch.Tensor): + y_mal_orig = y_mal_orig.numpy() results_file = Path("results.csv") exists = results_file.exists() @@ -35,24 +39,27 @@ def _export_results(model: 'MalGAN', valid_loss: TensorOrFloat, test_loss: Tenso header = ",".join(["time_completed,M,Z,batch_size,test_set_size,detector_type,activation", "gen_hidden_dim,discim_hidden_dim", "avg_validation_loss,avg_test_loss,avg_num_bits_changed", - "auc,tpr,fpr,fnr,tnr"]) + "auc,orig_mal_detect_rate,mod_mal_detect_rate,ben_mal_detect_rate"]) if not exists: f_out.write(header) results = ["\n%s" % datetime.datetime.now(), "%d,%d,%d" % (model.M, model.Z, model.__class__.MALWARE_BATCH_SIZE), - "%d,%s,%s" % (len(y), model._bb.type.name, model._g.__class__.__name__), - "%s,%s" % (str(model.d_gen), str(model.d_discrim)), + "%d,%s,%s" % (len(y_actual), model._bb.type.name, model._g.__class__.__name__), + "\"%s\",\"%s\"" % (str(model.d_gen), str(model.d_discrim)), "%.15f,%.15f,%.3f" % (valid_loss, test_loss, avg_num_bits_changed)] - auc = roc_auc_score(y, y_prob) - results.append("%.6f" % auc) + auc = roc_auc_score(y_actual, y_prob) + results.append("%.8f" % auc) + + # Calculate the detection rate on unmodified malware + results.append("%.8f" % y_mal_orig.mean()) # Write the TxR and NxR information - tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel() - tnr, tpr = tn / (tn + fp), tp / (tp + fn) - for rate in [tpr, 1 - tnr, 1 - tpr, tnr]: - results.append("%.6f" % rate) + tn, fp, fn, tp = confusion_matrix(y_actual, y_hat).ravel() + tpr, fpr = tp / (tp + fn), fp / (tn + fp) + for rate in [tpr, fpr]: + results.append("%.8f" % rate) results = ",".join(results) f_out.write(results) diff --git a/requirements.txt b/requirements.txt index 5e069cf..d99cb7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ tqdm==4.30.0 -numpy==1.16.1 +numpy>=1.16.0 torch==1.0.0 -typing==3.6.6 -scikit_learn==0.20.2 +typing>=3.6.6 +scikit_learn>=0.20.3 tensorboardX==1.6