From c7e802b3bdb62962c71e85163bc1d66191f2d363 Mon Sep 17 00:00:00 2001
From: d-schindler <60650591+d-schindler@users.noreply.github.com>
Date: Fri, 18 Aug 2023 13:07:45 +0200
Subject: [PATCH] move normalised conditional entropy to utils

---
 notebooks/02_baseline_network_analysis.ipynb | 50 +-------------------------------------------------
 src/utils.py                                 | 12 +++++
 2 files changed, 13 insertions(+), 49 deletions(-)

diff --git a/notebooks/02_baseline_network_analysis.ipynb b/notebooks/02_baseline_network_analysis.ipynb
index 3359947..090f95b 100644
--- a/notebooks/02_baseline_network_analysis.ipynb
+++ b/notebooks/02_baseline_network_analysis.ipynb
@@ -48,7 +48,7 @@
     "    sys.path.append(module_path)\n",
     "\n",
     "from network_analysis import remove_self_loops, visualse_largest_components\n",
-    "from utils import R2_score\n",
+    "from utils import R2_score, normalised_conditional_entropy\n",
     "\n",
     "root_figure = path+\"/figures/\"\n",
     "root_map = path+'/data/geo_shapefiles//NUTS_Level_3__January_2018__Boundaries-shp/NUTS_Level_3__January_2018__Boundaries.shp'\n",
@@ -3798,54 +3798,6 @@
     "see Lambiotte et al. 2009: belongs to the interval [0, 1], but is now an asymmetric quantity that vanishes only if each community of Pt is the union of communities of Pt"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def entropy(labels):\n",
-    "    \"\"\"Calculates the entropy for a labeling.\n",
-    "    Parameters\n",
-    "    ----------\n",
-    "    labels : int array, shape = [n_samples]\n",
-    "        The labels\n",
-    "    Notes\n",
-    "    -----\n",
-    "    The logarithm used is the natural logarithm (base-e).\n",
-    "    \"\"\"\n",
-    "    if len(labels) == 0:\n",
-    "        return 1.0\n",
-    "    label_idx = np.unique(labels, return_inverse=True)[1]\n",
-    "    pi = np.bincount(label_idx).astype(np.float64)\n",
-    "    pi = pi[pi > 0]\n",
-    "    pi_sum = np.sum(pi)\n",
-    "    # log(a / b) should be calculated as log(a) - log(b) for\n",
-    "    # possible loss of precision\n",
-    "    return -np.sum((pi / pi_sum) * (np.log(pi) - log(pi_sum)))\n",
-    "\n",
-    "def variation_of_information(x,y, normalised = True):\n",
-    "    Ex = entropy(x)\n",
-    "    Ey = entropy(y)\n",
-    "    I = metrics.mutual_info_score(x,y)\n",
-    "    \n",
-    "    if normalised:\n",
-    "        return (Ex + Ey - 2*I) / (Ex + Ey - I)\n",
-    "    else: \n",
-    "        return Ex + Ey - 2*I\n",
-    "\n",
-    "def normalised_conditional_entropy(x,y):\n",
-    "    \"\"\"\n",
-    "    H(X|Y) = H(X) - I(X,Y) and we normalise with log(N)\n",
-    "    \"\"\"\n",
-    "    \n",
-    "    N = len(x)\n",
-    "    Ex = entropy(x)\n",
-    "    I = metrics.mutual_info_score(x,y)\n",
-    "\n",
-    "    return (Ex - I) / np.log(N)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 38,
diff --git a/src/utils.py b/src/utils.py
index 3d1f68f..68d0db3 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -48,3 +48,15 @@ def variation_of_information(x, y, normalised=True):
         return (Ex + Ey - 2 * I) / (Ex + Ey - I)
     else:
         return Ex + Ey - 2 * I
+
+
+def normalised_conditional_entropy(x, y):
+    """
+    H(X|Y) = H(X) - I(X,Y) and we normalise with log(N)
+    """
+
+    N = len(x)
+    Ex = entropy(x)
+    I = metrics.mutual_info_score(x, y)
+
+    return (Ex - I) / np.log(N)
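
For reference, a minimal usage sketch of the relocated helper (not part of the patch). The toy partition labels are hypothetical, and it assumes src/ is on sys.path so that `from utils import ...` resolves, as the notebook import touched above already requires.

import numpy as np

from utils import normalised_conditional_entropy  # provided by src/utils.py after this patch

# Hypothetical toy partitions of N = 6 nodes.
coarse = np.array([0, 0, 0, 1, 1, 1])  # coarser partition
fine = np.array([0, 1, 1, 2, 2, 3])    # refinement: every coarse community is a union of fine ones

# H(coarse | fine) / log(N) vanishes when `fine` refines `coarse`
# (the Lambiotte et al. 2009 property quoted in the notebook).
print(normalised_conditional_entropy(coarse, fine))  # ~0.0

# For an unrelated partition the quantity is strictly positive (and at most 1).
print(normalised_conditional_entropy(coarse, np.array([0, 1, 0, 1, 0, 1])))  # > 0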