
Commit 4e16003

Author: prithagupta
Commit message: Added utility functions on docs
Parent: 26e87c7

File tree: 5 files changed, +158 −23 lines


docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ Contents
 
    notebooks/comparing_mi_estimators
    notebooks/automated_information_leakage_detection
+   notebooks/utils
 
 .. toctree::
    :maxdepth: 2

docs/source/notebooks/automated_information_leakage_detection.ipynb

Lines changed: 10 additions & 9 deletions
@@ -17,10 +17,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "63a2d792-bd63-4f48-b64c-5bdcf0d9af40",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-08-16T19:46:29.050721Z",
+     "start_time": "2024-08-16T19:46:29.033296Z"
+    }
+   },
    "source": [
     "import logging\n",
     "import warnings\n",
@@ -31,7 +34,9 @@
     "logging.getLogger(\"pytorch\").setLevel(logging.ERROR)\n",
     "logging.getLogger(\"torch\").setLevel(logging.ERROR)\n",
     "logging.getLogger(\"urllib3.connectionpool\").setLevel(logging.ERROR)"
-   ]
+   ],
+   "outputs": [],
+   "execution_count": 1
   },
   {
    "cell_type": "code",
@@ -52,13 +57,9 @@
    }
   ],
   "source": [
-   "import numpy as np\n",
-   "import matplotlib.pyplot as plt\n",
-   "from sklearn.model_selection import train_test_split\n",
    "from autoqild.detectors import TabPFNLeakageDetector, RandomForestLeakageDetector\n",
    "from autoqild.dataset_readers import SyntheticDatasetGeneratorDistance\n",
-   "from utils import setup_logging, setup_random_seed, create_search_space\n",
-   "import pandoc"
+   "from utils import setup_logging, setup_random_seed, create_search_space"
   ]
  },
  {

docs/source/notebooks/comparing_mi_estimators.ipynb

Lines changed: 2 additions & 5 deletions
@@ -39,15 +39,12 @@
   "outputs": [],
   "source": [
    "# Cell 1: Setup and Imports\n",
-   "import matplotlib.pyplot as plt\n",
-   "import numpy as np\n",
    "from autoqild.dataset_readers import SyntheticDatasetGenerator\n",
    "from autoqild.mi_estimators.mi_estimator_classification import ClassficationMIEstimator\n",
    "from autoqild.mi_estimators import MineMIEstimatorMSE, GMMMIEstimator, TabPFNMIEstimator\n",
    "from autoqild.utilities import print_dictionary\n",
    "from utils import setup_logging, setup_random_seed\n",
-   "from sklearn.model_selection import train_test_split\n",
-   "import pandoc"
+   "from sklearn.model_selection import train_test_split"
   ]
  },
  {
@@ -57,7 +54,7 @@
   "source": [
    "**Setting Up Experiment Logging and Random Seed:**\n",
    "\n",
-   "Initialize logging for the experiment, recording all key events in info_leakage_detection.log for tracking and debugging."
+   "Initialize logging for the experiment, recording all key events in info_leakage_detection.log for tracking and debugging. Using the utils.py file."
  ]
 },
 {

docs/source/notebooks/utils.py

Lines changed: 0 additions & 9 deletions
@@ -12,8 +12,6 @@
 from sklearn.utils import check_random_state
 from skopt.space import Real, Categorical, Integer
 
-from autoqild import *
-
 
 def create_search_space(hp_ranges, logger):
     def isint(v):
@@ -45,13 +43,6 @@ def isstr(v):
     return search_space
 
 
-def convert_learner_params(params):
-    for key, value in params.items():
-        if value == "None":
-            params[key] = None
-    return params
-
-
 def setup_logging(log_path=None, level=logging.INFO):
     """Function setup as many logging for the experiments."""
     if log_path is None:

docs/source/notebooks/utils.rst

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
+Utilities Module
+================
+
+This module contains utility functions for logging setup, search space creation, and random seed setup, designed for compatibility with TensorFlow, PyTorch, and scikit-learn.
+
+Functions
+---------
+
+create_search_space
+-------------------
+
+This function creates a hyperparameter search space based on provided ranges, supporting integers, floats, booleans, and strings.
+
+**Code:**
+
+.. code-block:: python
+
+    import inspect
+    import logging
+    import os
+    import random
+
+    import numpy as np
+    import sklearn
+    import tensorflow as tf
+    import torch
+    from packaging import version
+    from skopt.space import Real, Categorical, Integer
+
+    def create_search_space(hp_ranges, logger):
+        def isint(v):
+            return type(v) is int
+
+        def isfloat(v):
+            return type(v) is float
+
+        def isbool(v):
+            return type(v) is bool
+
+        def isstr(v):
+            return type(v) is str
+
+        search_space = {}
+        for key, value in hp_ranges.items():
+            logger.info(f"Before key {key} value {value}")
+            if version.parse(sklearn.__version__) < version.parse("0.25.0"):
+                if key == "criterion" and "squared_error" in value:
+                    value = ["friedman_mse", "mse"]
+            if isint(value[0]) and isint(value[1]):
+                search_space[key] = Integer(value[0], value[1])
+            if isfloat(value[0]) and isfloat(value[1]):
+                if len(value) == 3:
+                    search_space[key] = Real(value[0], value[1], prior=value[2])
+            if (isbool(value[0]) and isbool(value[1])) or (isstr(value[0]) and isstr(value[1])):
+                search_space[key] = Categorical(value)
+            logger.info(f"key {key} value {value}")
+        return search_space
+
+
+setup_logging
+-------------
+
+Sets up logging for experiments, allowing control over log file location and verbosity.
+
+**Code:**
+
+.. code-block:: python
+
+    def setup_logging(log_path=None, level=logging.INFO):
+        """Function setup as many logging for the experiments."""
+        if log_path is None:
+            dirname = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+            dirname = os.path.dirname(dirname)
+            log_path = os.path.join(dirname, "logs", "logs.log")
+
+        logging.basicConfig(
+            filename=log_path,
+            level=level,
+            format="%(asctime)s %(name)s %(levelname)-8s %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+            force=True,
+        )
+        logger = logging.getLogger("SetupLogging")  # root logger
+        logger.info("log file path: {}".format(log_path))
+        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # Suppresses INFO, WARNING, and ERROR logs
+        # Additional TensorFlow setting to disable GPU usage explicitly
+        tf.config.set_visible_devices([], "GPU")
+        logging.captureWarnings(False)
+        import warnings
+
+        warnings.filterwarnings("ignore")
+        warnings.filterwarnings("ignore", category=DeprecationWarning)
+        logging.getLogger("matplotlib").setLevel(logging.ERROR)
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("pytorch").setLevel(logging.ERROR)
+        logging.getLogger("torch").setLevel(logging.ERROR)
+        logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
+
+
+setup_random_seed
+-----------------
+
+Sets up a random seed across TensorFlow, PyTorch, NumPy, and Python’s `random` module, while also configuring CPU and GPU usage.
+
+**Code:**
+
+.. code-block:: python
+
+    def setup_random_seed(random_state=1234):
+        logger = logging.getLogger("Setup Logging")
+        random_state = check_random_state(random_state)
+
+        seed = random_state.randint(2**31, dtype="uint32")
+        torch.manual_seed(seed)
+        logger.info(f"Total number of torch threads {torch.get_num_threads()}")
+        if torch.get_num_threads() <= 2:
+            n_cpus = 1
+        else:
+            n_cpus = torch.get_num_threads() - 2
+        if "pc2" in os.environ["HOME"]:
+            n_cpus = 4
+        logger.info(f"Torch threads set {n_cpus}")
+
+        torch.set_num_threads(n_cpus)
+        tf.random.set_seed(seed)
+
+        seed = random_state.randint(2**31, dtype="uint32")
+        np.random.seed(seed)
+        random.seed(seed)
+        os.environ["KERAS_BACKEND"] = "tensorflow"
+        devices = tf.config.list_physical_devices("GPU")
+        logger.info("Keras Devices {}".format(devices))
+        n_gpus = len(devices)
+        logger.info("Keras GPU {}".format(n_gpus))
+        if n_gpus == 0:
+            cpu_count = multiprocessing.cpu_count()
+            tf.config.threading.set_inter_op_parallelism_threads(1)
+            tf.config.threading.set_intra_op_parallelism_threads(1)
+            if cpu_count > 2:
+                pass
+        else:
+            for gpu in tf.config.list_physical_devices("GPU"):
+                tf.config.experimental.set_memory_growth(gpu, True)
+        torch_gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        logger.info("Torch GPU device {}".format(torch_gpu))
