Extend GMM-MI to work with multivariate random variables, too #18

Open
wants to merge 14 commits into main
2 changes: 1 addition & 1 deletion README.md
@@ -44,7 +44,7 @@ Once you installed GMM-MI, calculating the distribution of mutual information on
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X)

This yields (0.21 ± 0.04) nat, well in agreement with the theoretical value of 0.22 nat. There are many things that you can do: for example, you can also pass two 1D arrays instead of a single 2D array, and even calculate the KL divergence between the marginals (as shown in the walkthrough notebook). If you want to visualize the fitted model over your input data, you can run:
This yields (0.21 ± 0.04) nat, well in agreement with the theoretical value of 0.22 nat. There are many things that you can do: for example, you can also pass two 1D arrays instead of a single 2D array, and even calculate the KL divergence between the marginals. With the same syntax, you can now also compute MI between multivariate variables, as we show in the walkthrough notebook. If you want to visualize the fitted model over your input data, you can run:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
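Going back to the new multivariate feature mentioned above, here is a minimal sketch of the usage (hypothetical toy data; split=2 assigns the first two columns to the first variable and the rest to the second; the import path is assumed from this repo's layout):

import numpy as np
from gmm_mi.mi import EstimateMI

rng = np.random.default_rng(0)
A = rng.normal(size=(1000, 2))                   # first variable, 2 features
b = A[:, :1] + 0.5 * rng.normal(size=(1000, 1))  # second variable, correlated with A
X = np.hstack((A, b))
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X, split=2)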
3 changes: 2 additions & 1 deletion gmm_mi/cross_validation.py
@@ -71,7 +71,8 @@ def _run_cross_validation(self, X):
w_init, m_init, c_init, p_init = initialize_parameters(X, random_state=random_state,
n_components=self.n_components,
init_type=self.init_type,
scale=self.scale)
scale=self.scale,
reg_covar=self.reg_covar)
# perform k-fold CV
for k_id, (train_indices, valid_indices) in enumerate(self.kf.split(X)):
X_training = X[train_indices]
32 changes: 32 additions & 0 deletions gmm_mi/gmm.py
@@ -382,6 +382,38 @@ def estimate_cMI_MC(self, MC_samples=1e5):
MI = np.mean(marginal_zs + joint - marginal_xz - marginal_yz)
self.MI = MI
return MI

def estimate_MI_MC_highdim(self, split, MC_samples=1e5):
"""Compute the mutual information among multidimensional variables associated with a particular GMM model,
using MC integration.

Parameters
----------
split : int
Index at which the feature columns are split between the two variables: features [:split] belong to the first variable, and features [split:] to the second.
MC_samples : integer, default=1e5
Number of Monte Carlo (MC) samples to perform numerical integration of the MI integral.

Returns
-------
MI : float
The value of mutual information.
"""
points, _ = self.sample(MC_samples)
# evaluate the log-likelihood for the joint probability
joint = self.score_samples(points)
# and the marginals (x) and (y) too, but this time they are multidimensional
gmm_x = GMMWithMI(n_components=self.n_components, weights_init=self.weights_,
means_init=self.means_[:, :split], covariances_init=self.covariances_[:, :split, :split])
marginal_x = gmm_x.score_samples(points[:, :split])
# and for y
gmm_y = GMMWithMI(n_components=self.n_components, weights_init=self.weights_,
means_init=self.means_[:, split:], covariances_init=self.covariances_[:, split:, split:])
marginal_y = gmm_y.score_samples(points[:, split:])
# finally, we put everything together
MI = np.mean(joint - marginal_x - marginal_y)
self.MI = MI
return MI
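As a sanity check of the estimator above, which averages log p(x, y) - log p(x) - log p(y) over samples drawn from the joint, here is a self-contained sketch on a single correlated Gaussian, where the analytic value is -0.5 * log(1 - rho^2); this uses scipy only and makes no assumptions about GMM-MI internals:

import numpy as np
from scipy.stats import multivariate_normal

rho = 0.6
joint = multivariate_normal(mean=[0.0, 0.0], cov=[[1.0, rho], [rho, 1.0]])
samples = joint.rvs(size=100_000, random_state=42)
log_joint = joint.logpdf(samples)
log_x = multivariate_normal(mean=[0.0], cov=[[1.0]]).logpdf(samples[:, :1])  # marginal of x
log_y = multivariate_normal(mean=[0.0], cov=[[1.0]]).logpdf(samples[:, 1:])  # marginal of y
MI_mc = np.mean(log_joint - log_x - log_y)  # MC estimate, ~0.223 nat
MI_exact = -0.5 * np.log(1.0 - rho**2)      # analytic value, ~0.223 nat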

def estimate_KL_MC(self, kl_order='forward', MC_samples=1e5):
"""Compute the KL-divergence (KL) associated with a particular GMM model,
1 change: 0 additions & 1 deletion gmm_mi/initializations.py
@@ -206,7 +206,6 @@ def initialize_parameters(X, random_state=None, n_components=1, init_type='rando
If scale is not given, it will be set such that the volume of all components completely fills the space covered by data.
reg_covar : float, default=0
Constant regularisation term added to the diagonal of each covariance matrix, to avoid singular matrices.
In GMM-MI, the regularisation term is added during training only, and not in the initializations.

Returns
----------
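For context on this docstring change (reg_covar is now also applied in the initializations, as the cross_validation.py hunk above shows), a tiny standalone illustration of what the regularisation term does:

import numpy as np

cov = np.array([[1.0, 1.0],
                [1.0, 1.0]])            # singular covariance matrix (determinant 0)
reg_covar = 1e-6
cov_reg = cov + reg_covar * np.eye(2)   # regularised diagonal: determinant ~2e-6 > 0
print(np.linalg.det(cov), np.linalg.det(cov_reg))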
114 changes: 77 additions & 37 deletions gmm_mi/mi.py
@@ -114,44 +114,53 @@ def __getattr__(self, attr):
return getattr(self.mi_dist_params, attr)
except:
pass

# this is only to make the class picklable, to save/load the MI estimator
def __getstate__(self): return self.__dict__
def __setstate__(self, d): self.__dict__.update(d)

def _check_shapes(self, X, Y):
""" Check that the shapes of the arrays given as input to GMM-MI are either 2D or 1D,
and return the correct array to give as input.
""" Check that the shapes of the arrays given as input to GMM-MI are 2D, 1D,
or with the correct conditional structure. Return the correct array to give as input.

Parameters
----------
X : array-like of shape (n_samples, 2), (n_samples, 1), (n_samples) or (n_samples, 2+n_var)
X : array-like of shape (n_samples, n_features), where the accepted n_features depends on the use case.
Samples from the joint distribution of the two variables whose MI or KL is calculated.
If Y is None, must be of shape (n_samples, 2); otherwise, it must be either (n_samples, 1) or (n_samples).
The (n_samples, 2+n_var) case is for the conditional MI(X, Y | Z_1, ..., z_n), where conditional MI is computed.
Y : array-like of shape (n_samples, 1) or (n_samples), default=None
If Y is None, must be of shape (n_samples, 2) for standard MI, or (n_samples, 2+n_var)
for conditional MI, or (n_samples, features_1 + features_2) for multidimensional variables;
otherwise, it must be either (n_samples, 1) or (n_samples) for the standard case,
or (n_samples, features_1) in the high-dimensional case.
The case (n_samples, 2+n_var) is reserved for the conditional MI(X, Y | Z_1, ..., Z_n);
the variables must be given in this order.
Y : array-like of shape (n_samples, n_features), default=None
Samples from the marginal distribution of one of the two variables whose MI or KL is calculated.
If None, X must be of shape (n_samples, 2); otherwise, X and Y must be (n_samples, 1) or (n_samples).

If None, all features must already be in X; otherwise, Y must be of shape (n_samples, features_2).
Returns
----------
X : array-like of shape (n_samples, 2) or (n_samples, 2+n_var)
X : array-like of shape (n_samples, n_features)
The 2D array that is used to estimate MI or KL, with all features stacked along the second axis.
"""
if len(X.shape) == 1:
X = np.reshape(X, (X.shape[0], 1)) # add extra dimension
if Y is None:
if X.shape[1] < 2:
raise ValueError(f"Y is None, but the input array X is not 2- (or higher-) dimensional. "\
f"In this case, both X and Y should be 1-dimensional.")
f"Please input a Y array or increase the dimensionality of X.")
else:
return X
# if Y is not None, we can manipulate it
else:
if X.shape[0] != Y.shape[0]:
raise ValueError(f"X and Y have different numbers of samples. Found {X.shape[0]} and {Y.shape[0]}.")
if len(Y.shape) == 1:
Y = np.reshape(Y, (Y.shape[0], 1)) # add extra dimension
if X.shape[1] == 1 and Y.shape[1] == 1:
X = np.hstack((X, Y))
return X
else:
raise ValueError(f"Y is not None, but the input arrays X or Y are not 1-dimensional. "\
f"Shapes found: {X.shape}, {Y.shape}.")
if not self.split:
self.split = X.shape[1]
X = np.hstack((X, Y))
return X
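A short sketch of the inference path above: when Y is given and no split was passed, split is set to X.shape[1] before the two arrays are stacked (hypothetical data; the import path is assumed from this repo's layout):

import numpy as np
from gmm_mi.mi import EstimateMI

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))   # first variable, 3 features
Y = rng.normal(size=(500, 2))   # second variable, 2 features
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X, Y)   # split inferred as 3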


def _select_best_metric(self, n_components):
"""Select best metric to choose the number of GMM components.
@@ -248,6 +257,7 @@ def _check_convergence(self, n_components):
Number of GMM components being fitted.
"""
if self.metric - self.best_metric > self.threshold_components:
self.patience_counter = 0 # reset patience counter
self.best_metric = self.metric
if self.verbose:
print(f'Current number of GMM components: {n_components}. Current metric: {self.best_metric:.3f}. Adding one component...')
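The added reset matters for the early-stopping loop that selects the number of components; a hypothetical sketch of the patience logic (the increment branch is assumed from context and not shown in this hunk):

import numpy as np

best_metric, patience_counter = -np.inf, 0
patience, threshold = 2, 0.0
for n_components, metric in enumerate([-10.0, -8.0, -8.05, -7.9, -7.95, -7.96], start=1):
    if metric - best_metric > threshold:
        patience_counter = 0     # reset on improvement: the line added in this PR
        best_metric = metric
    else:
        patience_counter += 1    # assumed increment branch
        if patience_counter >= patience:
            break                # stop adding GMM components
# without the reset, the failed attempt at 3 components would still count
# against patience after the improvement at 4 components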
@@ -281,18 +291,21 @@ def _calculate_MI(self, gmm, tol_int=1.49e-8, limit=np.inf):
The value of MI, in the units specified by the base provided as input to the `estimate` method.
"""
if self.integral_method == 'MC':
if self.conditional == False:
MI = gmm.estimate_MI_MC(MC_samples=self.MC_samples)
else:
if self.conditional:
MI = gmm.estimate_cMI_MC(MC_samples=self.MC_samples)
elif self.split is not None:
MI = gmm.estimate_MI_MC_highdim(split=self.split, MC_samples=self.MC_samples)
else:
MI = gmm.estimate_MI_MC(MC_samples=self.MC_samples)
elif self.integral_method == 'quad':
if self.conditional == False:
if not self.conditional and self.split is None:
MI = gmm.estimate_MI_quad(tol_int=tol_int, limit=limit)
else:
raise NotImplementedError(
"Estimating conditional MI with quadrature methods is currently not implemented. "
"Estimating conditional MI, or MI between multivariate variables, with quadrature methods "
"is currently not implemented. "
"Please use `MC` method, or write to dr.davide.piras@gmail.com if you would like "
"this feature added."
"this feature added, or raise an issue here: https://github.com/dpiras/GMM-MI/issues."
)
return MI

@@ -413,22 +426,28 @@ def _set_units(self, MI_mean, MI_std, base):
MI_std /= np.log(base)
return MI_mean, MI_std

def fit(self, X, Y=None, verbose=False):
def fit(self, X, Y=None, conditional=False, split=None, verbose=False):
"""Performs density estimation of the data using GMMs and k-fold cross-validation.
The fitted model will be used to estimate MI and/or KL.

Parameters
----------
X : array-like of shape (n_samples, 2), (n_samples, 1), (n_samples) or (n_samples, 3)
X : array-like of shape (n_samples, n_features), where the accepted n_features depends on the use case.
Samples from the joint distribution of the two variables whose MI or KL is calculated.
If Y is None, must be of shape (n_samples, 2) or (n_samples, 3);
otherwise, it must be either (n_samples, 1) or (n_samples).
If Y is None, must be of shape (n_samples, 2) for standard MI, or (n_samples, 2+n_var)
for conditional MI, or (n_samples, features_1 + features_2) for multidimensional variables;
otherwise, it must be either (n_samples, 1) or (n_samples) for the standard case,
or (n_samples, features_1) in the high-dimensional case.
The case (n_samples, 2+n_var) is reserved for the conditional MI(X, Y | Z_1, ..., Z_n);
the variables must be given in this order.
Y : array-like of shape (n_samples, 1) or (n_samples), default=None
Y : array-like of shape (n_samples, n_features), default=None
Samples from the marginal distribution of one of the two variables whose MI or KL is calculated.
If None, X must be of shape (n_samples, 2) or (n_samples, 2+n_var);
otherwise, X and Y must be (n_samples, 1) or (n_samples).
If None, all features must already be in X; otherwise, Y must be of shape (n_samples, features_2).
conditional : bool, default=False
Whether to compute conditional MI between unidimensional variables given a set of other unidimensional variables.
split : int, default=None
When computing MI between multidimensional variables, this integer sets where the feature columns are split:
features [:split] belong to the first variable, and features [split:] to the second.
verbose : bool, default=False
Whether to print useful procedural statements.

@@ -437,6 +456,8 @@
None
"""
self.verbose = verbose
self.conditional = conditional
self.split = split
# if fit was already done, exit without doing anything
if self.fit_done:
if self.verbose:
@@ -447,11 +468,28 @@ def fit(self, X, Y=None, verbose=False):
X = self._check_shapes(X, Y)
self.X = X
if X.shape[1] >= 3:
self.conditional = True
if self.verbose:
print('Shape of input array is 3-D or higher, so conditional mutual information '
'will be computed.')

if self.conditional:
if self.split:
raise NotImplementedError("Both conditional and a split were indicated, but conditional MI "\
"between multidimensional variables is not implemented yet. "\
"Feel free to open an issue here if you need it: "\
"\https://github.com/dpiras/GMM-MI/issues.")
if self.verbose:
print('Shape of input array is 3-D or higher and conditional=True, so conditional '
'mutual information will be computed. '
'Warning: this feature is experimental and has not been thoroughly tested; '
'make sure the data distribution is captured properly.')
else:
if self.split:
if self.verbose:
print('Shape of input array is 3-D or higher and a split was indicated, so '
'mutual information between multivariate variables will be computed. '
'Warning: this feature is experimental and has not been thoroughly tested; '
'make sure the data distribution is captured properly.')
else:
raise ValueError(f"Shape of input array is 3-D or higher, but nor conditional nor a split was indicated. "\
f"Please indicate one of the two to proceed with the fit, or include a Y array to infer the split value.")

if self.verbose:
if not self.fixed_components:
print('Starting cross-validation procedure to select the number of GMM components...')
@@ -593,11 +631,13 @@ def estimate(self, mi_dist_params=None, base=np.exp(1), include_kl=False, kl_ord
self.KL_mean, self.KL_std = self._set_units(self.KL_mean, self.KL_std, base)
return MI_mean, MI_std

def fit_estimate(self, X, Y=None, mi_dist_params=None, include_kl=False, kl_order='forward', base=np.exp(1), verbose=False):
def fit_estimate(self, X, Y=None, conditional=False, split=None,
mi_dist_params=None, include_kl=False, kl_order='forward',
base=np.exp(1), verbose=False):
"""Combine the `fit` and `estimate` methods for easier calculation of MI.
See the respective methods for all information.
"""
self.fit(X=X, Y=Y, verbose=verbose)
self.fit(X=X, Y=Y, conditional=conditional, split=split, verbose=verbose)
MI_mean, MI_std = self.estimate(mi_dist_params=mi_dist_params, include_kl=include_kl,
kl_order=kl_order, base=base, verbose=verbose)
return MI_mean, MI_std
@@ -684,7 +724,7 @@ def plot_fitted_contours(self, parameters=None, extents=None, n_samples=None, **
def _calculate_MI_categorical(self):
"""Calculate mutual information (MI) integral given a Gaussian mixture model in 2D.
Use only Monte Carlo (MC) method.
The complete formula can be found in Appendix B of Piras et al. (2022).
The complete formula can be found in Appendix B of Piras et al. (2023).

Returns
-------
248 changes: 220 additions & 28 deletions notebooks/walkthrough_and_pitfalls.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='gmm_mi',
version="0.8.0",
version="0.9.0",
author='Davide Piras',
author_email='dr.davide.piras@gmail.com',
description='Estimate mutual information distribution with Gaussian mixture models',