Version 1.0.0 release

ZaydH · Mar 15, 2019 · 843393d · 843393d
1 parent da9101f
commit 843393d
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 34 deletions.
diff --git a/malgan/__init__.py b/malgan/__init__.py
@@ -23,7 +23,6 @@
 import tensorboardX
 import torch
 import torch.nn as nn
-# import torch.nn.utils
 import torch.optim
 import torch.utils.data
 from torch.utils.data import Dataset, DataLoader, Subset
@@ -93,7 +92,7 @@ class MalGAN(nn.Module):
 
     MALWARE_BATCH_SIZE = 32
 
-    EXPORT_DIR = Path("export")
+    SAVED_MODEL_DIR = Path("saved_models")
 
     VALIDATION_SPLIT = 0.2
 
@@ -247,7 +246,7 @@ def fit_one_cycle(self, cyc_len: int, quiet_mode: bool = False) -> None:
             valid_l_g = self._meas_loader_gen_loss(self._mal_data.valid)
             MalGAN.tensorboard.add_scalar('Validation_Generator_Loss', valid_l_g, epoch_cnt)
             if valid_l_g < best_loss:
-                self._export(self._build_export_name(epoch_cnt))
+                self._save(self._build_export_name(epoch_cnt))
                 if best_epoch is not None:
                     self._delete_old_backup(best_epoch)
                 best_epoch, best_loss = epoch_cnt, valid_l_g
@@ -258,7 +257,7 @@ def fit_one_cycle(self, cyc_len: int, quiet_mode: bool = False) -> None:
         MalGAN.tensorboard.close()
 
         self.load(self._build_export_name(best_epoch))
-        self._export(self._build_export_name())
+        self._save(self._build_export_name())
         self._delete_old_backup(best_epoch)
 
     def _build_export_name(self, epoch_num: int = None) -> str:
@@ -276,7 +275,7 @@ def _build_export_name(self, epoch_num: int = None) -> str:
                 "final" if epoch_num is None else "epoch_%05d" % epoch_num]
 
         # Either add an epoch name or
-        return MalGAN.EXPORT_DIR / "".join(["_".join(name).lower(), ".pth"])
+        return MalGAN.SAVED_MODEL_DIR / "".join(["_".join(name).lower(), ".pth"])
 
     def _delete_old_backup(self, epoch_num: int) -> None:
         """
@@ -376,40 +375,45 @@ def measure_and_export_results(self) -> str:
         logging.debug("Final Validation Loss: %.6f", valid_loss)
         logging.debug("Final Test Loss: %.6f", test_loss)
 
-        num_bits_changed = num_mal_test = 0
-        m_prime_arr = []
+        num_mal_test = 0
+        y_mal_orig, m_prime_arr, bits_changed = [], [], []
         for m, _ in self._mal_data.test:
+            y_mal_orig.append(self._bb.predict(m.cpu()))
             if self._is_cuda:
                 m = m.cuda()
             num_mal_test += m.shape[0]
 
             m_prime, _ = self._gen.forward(m)
             m_prime_arr.append(m_prime.cpu() if self._is_cuda else m_prime)
+
             m_diff = m_prime - m
-            num_bits_changed += torch.sum(m_diff)
-            # Error check no bits flipped 1 -> 0
+            bits_changed.append(torch.sum(m_diff.cpu(), dim=1))
+
+            # Sanity check no bits flipped 1 -> 0
             msg = "Malware signature changed to 0 which is not allowed"
             assert torch.sum(m_diff < -0.1) == 0, msg
-        avg_changed_bits = num_bits_changed / num_mal_test
+        avg_changed_bits = torch.cat(bits_changed).mean()
         logging.debug("Avg. Malware Bits Changed Changed: %2f", avg_changed_bits)
 
+        # BB prediction of the malware before the generator
+        y_mal_orig = torch.cat(y_mal_orig)
+
         # Build an X tensor for prediction using the detector
-        ben_test_arr = []
-        for x_tmp, _ in self._ben_data.test:
-            ben_test_arr.append(x_tmp.cpu() if self._is_cuda else x_tmp)
+        ben_test_arr = [x.cpu() if self._is_cuda else x for x, _ in self._ben_data.test]
         x = torch.cat(m_prime_arr + ben_test_arr)
-        y = torch.cat((torch.full((num_mal_test,), MalGAN.Label.Malware.value),
-                       torch.full((len(x) - num_mal_test,), MalGAN.Label.Benign.value)))
+        y_actual = torch.cat((torch.full((num_mal_test,), MalGAN.Label.Malware.value),
+                             torch.full((len(x) - num_mal_test,), MalGAN.Label.Benign.value)))
 
-        y_hat = self._bb.predict(x)
+        y_hat_post = self._bb.predict(x)
         if self._is_cuda:
-            y_hat, y = y_hat.cpu(), y.cpu()
+            y_mal_orig, y_hat_post, y_actual = y_mal_orig.cpu(), y_hat_post.cpu(), y_actual.cpu()
         # noinspection PyProtectedMember
         y_prob = self._bb._model.predict_proba(x)  # pylint: disable=protected-access
         y_prob = y_prob[:, MalGAN.Label.Malware.value]
-        return _export_results(self, valid_loss, test_loss, avg_changed_bits, y, y_prob, y_hat)
+        return _export_results(self, valid_loss, test_loss, avg_changed_bits, y_actual,
+                               y_mal_orig, y_prob, y_hat_post)
 
-    def _export(self, file_path: PathOrStr) -> None:
+    def _save(self, file_path: PathOrStr) -> None:
         r"""
         Export the specified model to disk.  The function creates any files needed on the path.
         All exported models will be relative to \p EXPORT_DIR class object.

diff --git a/malgan/_export_results.py b/malgan/_export_results.py
@@ -8,51 +8,58 @@
 from sklearn.metrics import confusion_matrix, roc_auc_score
 
 TensorOrFloat = Union[torch.Tensor, float]
+TorchOrNumpy = Union[torch.Tensor, np.ndarray]
 
 
 # noinspection PyProtectedMember,PyUnresolvedReferences
 def _export_results(model: 'MalGAN', valid_loss: TensorOrFloat, test_loss: TensorOrFloat,
-                    avg_num_bits_changed: TensorOrFloat, y: np.ndarray,
-                    y_prob: Union[np.ndarray, torch.Tensor], y_hat: np.ndarray) -> str:
+                    avg_num_bits_changed: TensorOrFloat, y_actual: np.ndarray,
+                    y_mal_orig: TorchOrNumpy, y_prob: TorchOrNumpy, y_hat: np.ndarray) -> str:
     r"""
     Exports MalGAN results.
 
     :param model: MalGAN model
     :param valid_loss: Average loss on the malware validation set
     :param test_loss: Average loss on the malware test set
     :param avg_num_bits_changed:
-    :param y: Actual labels
+    :param y_actual: Actual labels
+    :param y_mal_orig: Predicted value on the original (unmodified) malware
     :param y_prob: Probability of malware
     :param y_hat: Predict labels
     :return: Results string
     """
     if isinstance(y_prob, torch.Tensor):
         y_prob = y_prob.numpy()
+    if isinstance(y_mal_orig, torch.Tensor):
+        y_mal_orig = y_mal_orig.numpy()
 
     results_file = Path("results.csv")
     exists = results_file.exists()
     with open(results_file, "a+") as f_out:
         header = ",".join(["time_completed,M,Z,batch_size,test_set_size,detector_type,activation",
                            "gen_hidden_dim,discim_hidden_dim",
                            "avg_validation_loss,avg_test_loss,avg_num_bits_changed",
-                           "auc,tpr,fpr,fnr,tnr"])
+                           "auc,orig_mal_detect_rate,mod_mal_detect_rate,ben_mal_detect_rate"])
         if not exists:
             f_out.write(header)
 
         results = ["\n%s" % datetime.datetime.now(),
                    "%d,%d,%d" % (model.M, model.Z, model.__class__.MALWARE_BATCH_SIZE),
-                   "%d,%s,%s" % (len(y), model._bb.type.name, model._g.__class__.__name__),
-                   "%s,%s" % (str(model.d_gen), str(model.d_discrim)),
+                   "%d,%s,%s" % (len(y_actual), model._bb.type.name, model._g.__class__.__name__),
+                   "\"%s\",\"%s\"" % (str(model.d_gen), str(model.d_discrim)),
                    "%.15f,%.15f,%.3f" % (valid_loss, test_loss, avg_num_bits_changed)]
 
-        auc = roc_auc_score(y, y_prob)
-        results.append("%.6f" % auc)
+        auc = roc_auc_score(y_actual, y_prob)
+        results.append("%.8f" % auc)
+
+        # Calculate the detection rate on unmodified malware
+        results.append("%.8f" % y_mal_orig.mean())
 
         # Write the TxR and NxR information
-        tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
-        tnr, tpr = tn / (tn + fp), tp / (tp + fn)
-        for rate in [tpr, 1 - tnr, 1 - tpr, tnr]:
-            results.append("%.6f" % rate)
+        tn, fp, fn, tp = confusion_matrix(y_actual, y_hat).ravel()
+        tpr, fpr = tp / (tp + fn), fp / (tn + fp)
+        for rate in [tpr, fpr]:
+            results.append("%.8f" % rate)
         results = ",".join(results)
         f_out.write(results)
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 tqdm==4.30.0
-numpy==1.16.1
+numpy>=1.16.0
 torch==1.0.0
-typing==3.6.6
-scikit_learn==0.20.2
+typing>=3.6.6
+scikit_learn>=0.20.3
 tensorboardX==1.6