Commit

Deploying to gh-pages from @ 4a4aab3 🚀
EtienneCmb committed Aug 30, 2024
1 parent 92cccbd commit 6a6ca76
Showing 156 changed files with 3,044 additions and 736 deletions.
2 changes: 1 addition & 1 deletion .buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 592af5c3e32f0ec823a47e0abd75e30d
+config: 999930e67a31d0b6695cfec21bc21d1a
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -0,0 +1,79 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Comparison of entropy estimators with high-dimensional data\n\nIn this example, we are going to compare estimators of entropy with\nhigh-dimensional data.\n\n1. Simulate data sampled from a multivariate normal distribution.\n2. Define estimators of entropy.\n3. Compute the entropy for a varying number of samples.\n4. See if the estimated entropy converge towards the theoretical value.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n\nfrom hoi.core import get_entropy\n\nimport matplotlib.pyplot as plt\n\nplt.style.use(\"ggplot\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Definition of entropy estimators\n\nWe are going to use the GCMI (Gaussian Copula Mutual Information), KNN\n(k Nearest Neighbor) and a Gaussian kernel-based estimator.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# list of estimators to compare\nmetrics = {\n \"GCMI\": get_entropy(\"gc\", biascorrect=False),\n \"KNN-3\": get_entropy(\"knn\", k=3),\n \"KNN-10\": get_entropy(\"knn\", k=10),\n \"Kernel\": get_entropy(\"kernel\"),\n}\n\n# number of samples to simulate data\nn_samples = np.geomspace(100, 10000, 10).astype(int)\n\n# number of repetitions to estimate the percentile interval\nn_repeat = 10\n\n\n# plotting function\ndef plot(ent, ent_theoric, ax):\n \"\"\"Plotting function.\"\"\"\n for n_m, metric_name in enumerate(ent.keys()):\n # get the entropies\n x = ent[metric_name]\n\n # get the color\n color = f\"C{n_m}\"\n\n # estimate lower and upper bounds of the [5, 95]th percentile interval\n x_low, x_high = np.percentile(x, [5, 95], axis=0)\n\n # plot the MI as a function of the number of samples and interval\n ax.plot(n_samples, x.mean(0), color=color, lw=2, label=metric_name)\n ax.fill_between(n_samples, x_low, x_high, color=color, alpha=0.2)\n\n # plot the theoretical value\n ax.axhline(\n ent_theoric, linestyle=\"--\", color=\"k\", label=\"Theoretical entropy\"\n )\n ax.legend()\n ax.set_xlabel(\"Number of samples\")\n ax.set_ylabel(\"Entropy [bits]\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Entropy of data sampled from multinormal distribution\n\nLet variables $X_1,X_2,...,X_n$ have a multivariate normal distribution\n$\\mathcal{N}(\\vec{\\mu}, \\Sigma)$ the theoretical entropy in bits is\ndefined by :\n\n\\begin{align}H(X) = \\frac{1}{2} \\times log_{2}({|\\Sigma|}(2\\pi e)^{n})\\end{align}\n\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# function for creating the covariance matrix with differnt modes\ndef create_cov_matrix(n_dims, cov, mode=\"dense\", k=None):\n \"\"\"Create a covariance matrix.\"\"\"\n # variance 1 for each dim\n cov_matrix = np.eye(n_dims)\n if mode == \"dense\":\n # all dimensions, but diagonal, with covariance cov\n cov_matrix += cov\n cov_matrix[np.diag_indices(n_dims)] = 1\n elif mode == \"sparse\":\n # only pairs x_i, x_(i+1) with i < k have covariance cov\n k = k if k is not None else n_dims\n for i in range(n_dims - 1):\n if i < k:\n cov_matrix[i, i + 1] = cov\n cov_matrix[i + 1, i] = cov\n\n return cov_matrix\n\n\ndef compute_true_entropy(cov_matrix):\n \"\"\"Compute the true entropy (bits).\"\"\"\n n_dims = cov_matrix.shape[0]\n det_cov = np.linalg.det(cov_matrix)\n return 0.5 * np.log2(det_cov * (2 * np.pi * np.e) ** n_dims)\n\n\n# number of dimensions per variable\nn_dims = 4\n# mean\nmu = [0.0] * n_dims\n# covariance\ncovariance = 0.6\n\n# modes for the covariance matrix:\n# - dense: off diagonal elements have specified covariance\n# - sparse: only pairs xi, x_(i+1) with i < k have specified covariance\nmodes = [\"dense\", \"sparse\"]\n# number of pairs with specified covariance\nk = n_dims\n\nfig = plt.figure(figsize=(10, 5))\n# compute entropy using various metrics\nentropy = {k: np.zeros((n_repeat, len(n_samples))) for k in metrics.keys()}\nfor i, mode in enumerate(modes):\n cov_matrix = create_cov_matrix(n_dims, covariance, mode=mode)\n # define the theoretic entropy\n ent_theoric = compute_true_entropy(cov_matrix)\n ax = fig.add_subplot(1, 2, i + 1)\n\n for n_s, s in enumerate(n_samples):\n for n_r in range(n_repeat):\n # generate samples from joint gaussian distribution\n fx = np.random.multivariate_normal(mu, cov_matrix, s)\n for metric, fcn in metrics.items():\n # extract x and y\n x = fx[:, :n_dims].T\n y = fx[:, n_dims:].T\n # compute entropy\n entropy[metric][n_r, n_s] = fcn(x)\n\n # plot the results\n plot(entropy, ent_theoric, ax)\n ax.title.set_text(f\"Mode: {mode}\")\n\nfig.suptitle(\n \"Comparison of entropy estimators when\\nthe data is high-dimensional\",\n fontweight=\"bold\",\n)\nfig.tight_layout()\nplt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
16 changes: 8 additions & 8 deletions _downloads/7aa0a86e987814b09694d9eac79452ec/plot_mi.py
@@ -45,7 +45,7 @@ def mi_binning(x, y, **kwargs):
 # list of estimators to compare
 metrics = {
     "GC": get_mi("gc", biascorrect=False),
-    "KNN-3": get_mi("knn", k=1),
+    "KNN-3": get_mi("knn", k=3),
     "KNN-10": get_mi("knn", k=10),
     "Kernel": get_mi("kernel"),
     "Binning": partial(mi_binning, n_bins=4),
@@ -59,7 +59,7 @@ def mi_binning(x, y, **kwargs):


 # plotting function
-def plot(mi, h_theoric):
+def plot(mi, mi_theoric):
     """Plotting function."""
     for n_m, metric_name in enumerate(mi.keys()):
         # get the entropies
@@ -76,7 +76,7 @@ def plot(mi, h_theoric):
         plt.fill_between(n_samples, x_low, x_high, color=color, alpha=0.2)

     # plot the theoretical value
-    plt.axhline(h_theoric, linestyle="--", color="k", label="Theoretical MI")
+    plt.axhline(mi_theoric, linestyle="--", color="k", label="Theoretical MI")
     plt.legend()
     plt.xlabel("Number of samples")
     plt.ylabel("Mutual-information [bits]")
@@ -101,7 +101,7 @@ def plot(mi, h_theoric):
 sigma_y = 1.0

 # covariance between x and y
-covariance = 0.3
+covariance = 0.5

 # covariance matrix
 cov_matrix = [[sigma_x**2, covariance], [covariance, sigma_y**2]]
@@ -111,12 +111,12 @@ def plot(mi, h_theoric):
     sigma_x**2 * sigma_y**2 / (sigma_x**2 * sigma_y**2 - covariance**2)
 )

-# compute entropies using various metrics
+# compute mi using various metrics
 mi = {k: np.zeros((n_repeat, len(n_samples))) for k in metrics.keys()}

-for metric, fcn in metrics.items():
-    for n_s, s in enumerate(n_samples):
-        for n_r in range(n_repeat):
+for n_s, s in enumerate(n_samples):
+    for n_r in range(n_repeat):
+        for metric, fcn in metrics.items():
             # generate samples from joint gaussian distribution
             fx = np.random.multivariate_normal([mu_x, mu_y], cov_matrix, s)

@@ -0,0 +1,154 @@
"""
Comparison of entropy estimators with high-dimensional data
=====================================================================

In this example, we are going to compare estimators of entropy with
high-dimensional data.

1. Simulate data sampled from a multivariate normal distribution.
2. Define estimators of entropy.
3. Compute the entropy for a varying number of samples.
4. See if the estimated entropy converges towards the theoretical value.
"""

import numpy as np

from hoi.core import get_entropy

import matplotlib.pyplot as plt

plt.style.use("ggplot")

###############################################################################
# Definition of entropy estimators
# --------------------------------
#
# We are going to use the GCMI (Gaussian Copula Mutual Information), the KNN
# (k-Nearest Neighbors) estimator with k=3 and k=10, and a Gaussian
# kernel-based estimator.

# list of estimators to compare
metrics = {
    "GCMI": get_entropy("gc", biascorrect=False),
    "KNN-3": get_entropy("knn", k=3),
    "KNN-10": get_entropy("knn", k=10),
    "Kernel": get_entropy("kernel"),
}
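
# A minimal usage sketch (illustration only, not part of the original
# example; `x_demo` is assumed toy data): each estimator returned by
# `get_entropy` is a callable that takes an array of shape
# (n_features, n_samples) and returns an entropy estimate in bits.
x_demo = np.random.normal(size=(4, 1000))  # 4 features, 1000 samples
print("GCMI entropy of white-noise data:", metrics["GCMI"](x_demo))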

# number of samples to simulate data
n_samples = np.geomspace(100, 10000, 10).astype(int)

# number of repetitions to estimate the percentile interval
n_repeat = 10


# plotting function
def plot(ent, ent_theoric, ax):
    """Plotting function."""
    for n_m, metric_name in enumerate(ent.keys()):
        # get the entropies
        x = ent[metric_name]

        # get the color
        color = f"C{n_m}"

        # estimate lower and upper bounds of the [5, 95]th percentile interval
        x_low, x_high = np.percentile(x, [5, 95], axis=0)

        # plot the entropy as a function of the number of samples and interval
        ax.plot(n_samples, x.mean(0), color=color, lw=2, label=metric_name)
        ax.fill_between(n_samples, x_low, x_high, color=color, alpha=0.2)

    # plot the theoretical value
    ax.axhline(
        ent_theoric, linestyle="--", color="k", label="Theoretical entropy"
    )
    ax.legend()
    ax.set_xlabel("Number of samples")
    ax.set_ylabel("Entropy [bits]")


###############################################################################
# Entropy of data sampled from multinormal distribution
# ------------------------------------------------------
#
# Let the variables :math:`X_1,X_2,...,X_n` follow a multivariate normal
# distribution :math:`\mathcal{N}(\vec{\mu}, \Sigma)`. The theoretical
# entropy in bits is then given by:
#
# .. math::
#     H(X) = \frac{1}{2} \log_{2}\left((2\pi e)^{n} |\Sigma|\right)
#
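# As a quick sanity check of this formula (illustration only, not part of the
# original example): for a single standard normal variable (:math:`n = 1`,
# :math:`|\Sigma| = 1`), the entropy reduces to
# :math:`\frac{1}{2} \log_{2}(2\pi e)`, i.e. about 2.05 bits.

print("H(N(0, 1)) in bits:", 0.5 * np.log2(2 * np.pi * np.e))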


# function for creating the covariance matrix with different modes
def create_cov_matrix(n_dims, cov, mode="dense", k=None):
    """Create a covariance matrix."""
    # variance 1 for each dim
    cov_matrix = np.eye(n_dims)
    if mode == "dense":
        # all off-diagonal elements have covariance cov
        cov_matrix += cov
        cov_matrix[np.diag_indices(n_dims)] = 1
    elif mode == "sparse":
        # only pairs x_i, x_(i+1) with i < k have covariance cov
        k = k if k is not None else n_dims
        for i in range(n_dims - 1):
            if i < k:
                cov_matrix[i, i + 1] = cov
                cov_matrix[i + 1, i] = cov

    return cov_matrix


def compute_true_entropy(cov_matrix):
    """Compute the true entropy (bits)."""
    n_dims = cov_matrix.shape[0]
    det_cov = np.linalg.det(cov_matrix)
    return 0.5 * np.log2(det_cov * (2 * np.pi * np.e) ** n_dims)
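

# A small illustration (demo values assumed, not part of the original
# example): a 3-dimensional "sparse" covariance matrix and its theoretical
# entropy.
demo_cov = create_cov_matrix(3, 0.6, mode="sparse")
print("demo covariance matrix:\n", demo_cov)
print("theoretical entropy (bits):", compute_true_entropy(demo_cov))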


# number of dimensions of the simulated data
n_dims = 4
# mean
mu = [0.0] * n_dims
# covariance
covariance = 0.6

# modes for the covariance matrix:
# - dense: off diagonal elements have specified covariance
# - sparse: only pairs x_i, x_(i+1) with i < k have specified covariance
modes = ["dense", "sparse"]
# number of pairs with specified covariance
k = n_dims

fig = plt.figure(figsize=(10, 5))
# compute entropy using various metrics
entropy = {k: np.zeros((n_repeat, len(n_samples))) for k in metrics.keys()}
for i, mode in enumerate(modes):
    cov_matrix = create_cov_matrix(n_dims, covariance, mode=mode)
    # compute the theoretical entropy
    ent_theoric = compute_true_entropy(cov_matrix)
    ax = fig.add_subplot(1, 2, i + 1)

    for n_s, s in enumerate(n_samples):
        for n_r in range(n_repeat):
            # generate samples from the joint gaussian distribution
            fx = np.random.multivariate_normal(mu, cov_matrix, s)
            for metric, fcn in metrics.items():
                # transpose the data to shape (n_features, n_samples)
                x = fx.T
                # compute the entropy
                entropy[metric][n_r, n_s] = fcn(x)

    # plot the results
    plot(entropy, ent_theoric, ax)
    ax.title.set_text(f"Mode: {mode}")

fig.suptitle(
    "Comparison of entropy estimators when\nthe data is high-dimensional",
    fontweight="bold",
)
fig.tight_layout()
plt.show()
