From c7e802b3bdb62962c71e85163bc1d66191f2d363 Mon Sep 17 00:00:00 2001
From: d-schindler <60650591+d-schindler@users.noreply.github.com>
Date: Fri, 18 Aug 2023 13:07:45 +0200
Subject: [PATCH] move normalised conditional entropy to utils

---
 notebooks/02_baseline_network_analysis.ipynb | 50 +-------------------------------------------------
 src/utils.py                                 | 12 +++++
 2 files changed, 13 insertions(+), 49 deletions(-)

diff --git a/notebooks/02_baseline_network_analysis.ipynb b/notebooks/02_baseline_network_analysis.ipynb
index 3359947..090f95b 100644
--- a/notebooks/02_baseline_network_analysis.ipynb
+++ b/notebooks/02_baseline_network_analysis.ipynb
@@ -48,7 +48,7 @@
     "    sys.path.append(module_path)\n",
     "\n",
     "from network_analysis import remove_self_loops, visualse_largest_components\n",
-    "from utils import R2_score\n",
+    "from utils import R2_score, normalised_conditional_entropy\n",
     "\n",
     "root_figure = path+\"/figures/\"\n",
     "root_map = path+'/data/geo_shapefiles//NUTS_Level_3__January_2018__Boundaries-shp/NUTS_Level_3__January_2018__Boundaries.shp'\n",
@@ -3798,54 +3798,6 @@
     "see Lambiotte et al. 2009: belongs to the interval [0, 1], but is now an asymmetric quantity that vanishes only if each community of Pt is the union of communities of Pt"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def entropy(labels):\n",
-    "    \"\"\"Calculates the entropy for a labeling.\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    labels : int array, shape = [n_samples]\n",
-    "        The labels\n",
-    "    Notes\n",
-    "    -----\n",
-    "    The logarithm used is the natural logarithm (base-e).\n",
-    "    \"\"\"\n",
-    "    if len(labels) == 0:\n",
-    "        return 1.0\n",
-    "    label_idx = np.unique(labels, return_inverse=True)[1]\n",
-    "    pi = np.bincount(label_idx).astype(np.float64)\n",
-    "    pi = pi[pi > 0]\n",
-    "    pi_sum = np.sum(pi)\n",
-    "    # log(a / b) should be calculated as log(a) - log(b) for\n",
-    "    # possible loss of precision\n",
-    "    return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum)))\n",
-    "\n",
-    "def variation_of_information(x,y, normalised = True):\n",
-    "    Ex = entropy(x)\n",
-    "    Ey = entropy(y)\n",
-    "    I = metrics.mutual_info_score(x,y)\n",
-    "    \n",
-    "    if normalised:\n",
-    "        return (Ex + Ey - 2*I) / (Ex + Ey - I)\n",
-    "    else: \n",
-    "        return Ex + Ey - 2*I\n",
-    "\n",
-    "def normalised_conditional_entropy(x,y):\n",
-    "    \"\"\"\n",
-    "    H(X|Y) = H(X) - I(X,Y) and we normalise with log(N)\n",
-    "    \"\"\"\n",
-    "    \n",
-    "    N = len(x)\n",
-    "    Ex = entropy(x)\n",
-    "    I = metrics.mutual_info_score(x,y)\n",
-    "\n",
-    "    return (Ex - I) / np.log(N)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 38,
diff --git a/src/utils.py b/src/utils.py
index 3d1f68f..68d0db3 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -48,3 +48,15 @@ def variation_of_information(x, y, normalised=True):
         return (Ex + Ey - 2 * I) / (Ex + Ey - I)
     else:
         return Ex + Ey - 2 * I
+
+
+def normalised_conditional_entropy(x, y):
+    """
+    H(X|Y) = H(X) - I(X,Y) and we normalise with log(N)
+    """
+
+    N = len(x)
+    Ex = entropy(x)
+    I = metrics.mutual_info_score(x, y)
+
+    return (Ex - I) / np.log(N)
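
For reference, a minimal usage sketch of the relocated helper (not part of the patch). The toy partition labels are hypothetical, and it assumes src/ is on sys.path so that `from utils import ...` resolves, as the notebook import touched above already requires.

import numpy as np

from utils import normalised_conditional_entropy  # provided by src/utils.py after this patch

# Hypothetical toy partitions of N = 6 nodes.
coarse = np.array([0, 0, 0, 1, 1, 1])  # coarser partition
fine = np.array([0, 1, 1, 2, 2, 3])    # refinement: every coarse community is a union of fine ones

# H(coarse | fine) / log(N) vanishes when `fine` refines `coarse`
# (the Lambiotte et al. 2009 property quoted in the notebook).
print(normalised_conditional_entropy(coarse, fine))  # ~0.0

# For an unrelated partition the quantity is strictly positive (and at most 1).
print(normalised_conditional_entropy(coarse, np.array([0, 1, 0, 1, 0, 1])))  # > 0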