Added initial version of the Malware GAN files.

ZaydH · Feb 20, 2019 · da9101f · da9101f
1 parent 303b8a8
commit da9101f
Show file tree

Hide file tree

Showing 11 changed files with 1,014 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,9 @@
+# Miscellaneous Files
+.idea/
+tags
+.DS_Store
+*.swp
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -0,0 +1,43 @@
+# Adversarial Malware Generation Using GANs
+
+[![license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/ZaydH/MalwareGAN/blob/master/LICENSE)
+
+Implementation of a Generative Adversarial Network (GAN) that can create adversarial malware examples.  The work is inspired by **MalGAN** in the paper "[*Generating Adversarial Malware Examples for Black-Box Attacks Based on GAN*](https://arxiv.org/abs/1702.05983)" by Weiwei Hu and Ying Tan.
+
+Framework written in [PyTorch](https://pytorch.org/) and supports CUDA.
+
+## Running the Script
+
+The malware GAN is provided as a package in the folder `malgan`.  A driver script is provided in `main.py`, which processes input arguments via `argparse`.  The basic interface is:
+
+    python main.py Z BATCH_SIZE NUM_EPOCHS MALWARE_FILE BENIGN_FILE
+
+* `Z` -- Dimension of the latent vector.  Must be a positive integer.
+* `BATCH_SIZE` -- Batch size for *malicious* examples.  The benign batch size is proportional to `BATCH_SIZE` and the fraction of total training samples that are benign.
+* `NUM_EPOCHS` -- Maximum number of training epochs
+* `MALWARE_FILE` -- Path to a serialized `numpy` or `torch` matrix where each row represents a single **malware** file's binary feature vector.
+* `BENIGN_FILE` -- Path to a serialized `numpy` or `torch` matrix where each row represents a single **benign** file's binary feature vector.
+
+For checkout purposes, we recommend calling:
+
+    python main.py 10 32 100 data/trial_mal.npy data/trial_ben.npy 
+
+## Dataset
+
+A trial dataset is included with this implementation in the `data` folder.  The data was published in the repository: [yanminglai/Malware-GAN](https://github.com/yanminglai/Malware-GAN).  This dataset should only be used for proof of concept and initial trials.
+
+We recommend the SLEIPNIR dataset.  It was published by Al-Dujaili et al.  The authors requested that the dataset not be shared publicly, and we respect that request.  However, researchers and students may request access directly from the authors as described on their [Github repository](https://github.com/ALFA-group/robust-adv-malware-detection).  Look for the link to the Google form.
+
+## CUDA Support
+
+The implementation supports both CPU and CUDA (i.e., GPU) execution.  If CUDA is detected on the system, the implementation defaults to CUDA support.
+
+## Requirements
+
+This program was tested with Python 3.6.5 on MacOS and on Debian Linux.  `requirements.txt` enumerates the exact packages used. A summary of the key requirements is below: 
+
+* PyTorch (`torch`) -- Ver. 1.0.0
+* Scikit-Learn (`sklearn`) -- Ver. 0.20.2
+* NumPy (`numpy`)
+* `tqdm` -- Nifty package for creating a full-featured progress bar.  If it is problematic, it can be removed.
+* TensorboardX -- If runtime profiling is not required, this can be removed.
diff --git a/data/trial_ben.npy b/data/trial_ben.npy
diff --git a/data/trial_mal.npy b/data/trial_mal.npy
diff --git a/main.py b/main.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+r"""
+    src.main
+    ~~~~~~~~
+
+    Main module for testing and debugging the MalGAN implementation.
+
+    :copyright: (c) 2019 by Zayd Hammoudeh.
+    :license: MIT, see LICENSE for more details.
+"""
+
+import argparse
+import pickle
+import sys
+import logging
+from typing import Union
+from pathlib import Path
+
+import numpy as np
+
+import torch
+from torch import nn
+
+from malgan import MalGAN, MalwareDataset, BlackBoxDetector
+
+
+def setup_logger(quiet_mode: bool, filename: str = "tester.log", log_level: int = logging.DEBUG):
+    r"""
+    Logger Configurator
+
+    Configures the root logger to write to \p filename and, unless \p quiet_mode is set, to echo
+    the same records to stdout.  Both destinations use the same message and timestamp format.
+
+    :param quiet_mode: True if quiet mode (i.e., disable logging to stdout) is used
+    :param filename: Log file name
+    :param log_level: Level to log
+    """
+    date_format = '%m/%d/%Y %I:%M:%S %p'  # Example Time Format - 12/12/2010 11:46:36 AM
+    format_str = '%(asctime)s -- %(levelname)s -- %(message)s'
+    logging.basicConfig(filename=filename, level=log_level, format=format_str, datefmt=date_format)
+
+    # Also print to stdout
+    if not quiet_mode:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(log_level)
+        # Pass the date format explicitly.  Without it the console timestamps fall back to the
+        # logging module's default layout and no longer match those written to the log file.
+        handler.setFormatter(logging.Formatter(format_str, datefmt=date_format))
+        logging.getLogger().addHandler(handler)
+
+    logging.info("******************* New Run Beginning *****************")
+
+
+def parse_args() -> argparse.Namespace:
+    r"""
+    Parse the command line arguments
+
+    The five positional arguments (\p Z, \p batch_size, \p num_epoch, \p mal_file, and
+    \p ben_file) are all required.  After parsing, the \p activation string is replaced by the
+    value returned from \p _configure_activation_function and the \p detector string by the
+    value returned from \p BlackBoxDetector.Type.get_from_name.
+
+    :return: Parsed argument structure
+    :raises ValueError: If the activation function name is not recognized
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    # Positional arguments are always required, so argparse would never fall back to a default.
+    parser.add_argument("Z", help="Dimension of the latent vector", type=int)
+    parser.add_argument("batch_size", help="Batch size", type=int)
+    parser.add_argument("num_epoch", help="Number of training epochs", type=int)
+
+    msg = "Data file containing the %s feature vectors"
+    for x in ["malware", "benign"]:
+        parser.add_argument(x[:3] + "_file", help=msg % x, type=Path)
+
+    parser.add_argument("-q", help="Quiet mode", action='store_true', default=False)
+
+    # The first fragment ends with an explicit space so the two sentences do not run together
+    help_msg = ("Dimension of the hidden layer(s) in the GENERATOR. "
+                "Multiple layers should be space separated")
+    parser.add_argument("--gen-hidden-sizes", help=help_msg, type=int,
+                        default=[256, 256], nargs="+")
+
+    help_msg = ("Dimension of the hidden layer(s) in the DISCRIMINATOR. "
+                "Multiple layers should be space separated")
+    parser.add_argument("--discrim-hidden-sizes", help=help_msg, type=int,
+                        default=[256, 256], nargs="+")
+
+    help_msg = " ".join(["Activation function for the generator and discriminator hidden",
+                         "layer(s). Valid choices (case insensitive) are: \"ReLU\", \"ELU\",",
+                         "\"LeakyReLU\", \"tanh\" and \"sigmoid\"."])
+    parser.add_argument("--activation", help=help_msg, type=str, default="LeakyReLU")
+
+    # Build an English list of the detector names, e.g.: "A", "B", and "C".
+    help_msg = ["Learner algorithm used in the black box detector. Valid choices (case ",
+                "insensitive) include:"]
+    names = BlackBoxDetector.Type.names()
+    for i, type_name in enumerate(names):
+        if i > 0 and len(names) > 2:  # Need three options for a comma to make sense
+            help_msg.append(",")
+        if len(names) > 1 and i == len(names) - 1:  # And only makes sense if at least two options
+            help_msg.append(" and")
+        help_msg.extend([" \"", type_name, "\""])
+    help_msg.append(".")
+    parser.add_argument("--detector", help="".join(help_msg), type=str,
+                        default=BlackBoxDetector.Type.RandomForest.name)
+
+    help_msg = "Print the results to the console. Intended for slurm results analysis"
+    parser.add_argument("--print-results", help=help_msg, action="store_true", default=False)
+
+    args = parser.parse_args()
+    # Swap the user-supplied names for the objects the rest of the program consumes
+    # noinspection PyTypeChecker
+    args.activation = _configure_activation_function(args.activation)
+    args.detector = BlackBoxDetector.Type.get_from_name(args.detector)
+    return args
+
+
+def _configure_activation_function(act_func_name: str) -> nn.Module:
+    r"""
+    Look up the PyTorch activation module that corresponds to a name.  If the name matches none
+    of the supported activation functions, a \p ValueError is raised.
+
+    **Note**: The name comparison is case insensitive.
+
+    :param act_func_name: Name of the activation function to look up
+    :return: Activation function module class (not an instance) associated with the passed name
+    """
+    normalized = act_func_name.lower()  # All lookup keys are lower case
+    supported = {
+        "relu": nn.ReLU,
+        "elu": nn.ELU,
+        "leakyrelu": nn.LeakyReLU,
+        "tanh": nn.Tanh,
+        "sigmoid": nn.Sigmoid,
+    }
+    module = supported.get(normalized)
+    if module is None:
+        raise ValueError("Unknown activation function: \"%s\"" % normalized)
+    return module
+
+
+def load_dataset(file_path: Union[str, Path], y: int) -> MalwareDataset:
+    r"""
+    Loads a single feature matrix from disk and packages it into the format expected by
+    \p MalGAN.  Supports NumPy (".npy", ".npz"), torch (".pt", ".pth"), and pickle (".pk") files.
+    The file extension check is case insensitive.  Other extensions result in a \p ValueError.
+
+    **Warning**: \p torch.load and \p pickle.load unpickle the file contents, which can execute
+    arbitrary code.  Only load files that come from a trusted source.
+
+    :param file_path: Path to the serialized feature matrix
+    :param y: Y value for dataset
+    :return: \p MalwareDataset constructed from the loaded data and \p y
+    :raises ValueError: If the file extension is not supported, or if a NumPy archive does not
+                        contain exactly one array
+    """
+    file_ext = Path(file_path).suffix.lower()
+    if file_ext in {".npy", ".npz"}:
+        data = np.load(file_path)
+        # A zipped archive loads as a lazy, dict-like container (exposing ``files``) instead of
+        # an array, so pull out the lone array and close the underlying file handle.
+        if hasattr(data, "files"):
+            with data as archive:
+                if len(archive.files) != 1:
+                    raise ValueError("Expected exactly one array in \"%s\" but found %d"
+                                     % (file_path, len(archive.files)))
+                data = archive[archive.files[0]]
+    elif file_ext in {".pt", ".pth"}:
+        # SECURITY: relies on unpickling -- see the warning in the docstring
+        data = torch.load(str(file_path))
+    elif file_ext == ".pk":
+        with open(str(file_path), "rb") as f_in:
+            # SECURITY: unpickling untrusted data is unsafe -- see the warning in the docstring
+            data = pickle.load(f_in)
+    else:
+        raise ValueError("Unknown file extension.  Cannot determine how to import")
+    return MalwareDataset(x=data, y=y)
+
+
+def main():
+    r"""
+    Driver for the script.  Parses the command line, configures logging, builds a \p MalGAN
+    from the malware and benign data files, runs \p fit_one_cycle, and exports the results.
+    """
+    args = parse_args()
+    setup_logger(args.q)
+
+    MalGAN.MALWARE_BATCH_SIZE = args.batch_size
+
+    if not torch.cuda.is_available():
+        logging.info("No GPU detected. Running CPU only.")
+    else:
+        logging.info("Torch GPU Available. Device #%d", torch.cuda.current_device())
+
+    # The malware file is read before the benign file
+    mal_data = load_dataset(args.mal_file, MalGAN.Label.Malware.value)
+    ben_data = load_dataset(args.ben_file, MalGAN.Label.Benign.value)
+    gan_kwargs = dict(Z=args.Z,
+                      h_gen=args.gen_hidden_sizes,
+                      h_discrim=args.discrim_hidden_sizes,
+                      g_hidden=args.activation,
+                      detector_type=args.detector)
+    gan = MalGAN(mal_data, ben_data, **gan_kwargs)
+
+    gan.fit_one_cycle(args.num_epoch, quiet_mode=args.q)
+    results = gan.measure_and_export_results()
+    if args.print_results:
+        print(results)
+
+
+# Run the driver only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()