Extend GMM-MI to work with multivariate random variables, too #18

Open
wants to merge 14 commits into main
2 changes: 1 addition & 1 deletion README.md
@@ -44,7 +44,7 @@ Once you installed GMM-MI, calculating the distribution of mutual information on
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X)

This yields (0.21 ± 0.04) nat, well in agreement with the theoretical value of 0.22 nat. There are many things that you can do: for example, you can also pass two 1D arrays instead of a single 2D array, and even calculate the KL divergence between the marginals (as shown in the walkthrough notebook). If you want to visualize the fitted model over your input data, you can run:
This yields (0.21 ± 0.04) nat, well in agreement with the theoretical value of 0.22 nat. There are many things that you can do: for example, you can also pass two 1D arrays instead of a single 2D array, and even calculate the KL divergence between the marginals. With the same syntax, you can now also compute MI between multivariate variables, as we show in the walkthrough notebook. If you want to visualize the fitted model over your input data, you can run:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
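Going back to the new multivariate feature mentioned above, here is a minimal sketch of the usage (hypothetical toy data; split=2 assigns the first two columns to the first variable and the rest to the second; the import path is assumed from this repo's layout):

import numpy as np
from gmm_mi.mi import EstimateMI

rng = np.random.default_rng(0)
A = rng.normal(size=(1000, 2))                   # first variable, 2 features
b = A[:, :1] + 0.5 * rng.normal(size=(1000, 1))  # second variable, correlated with A
X = np.hstack((A, b))
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X, split=2)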
3 changes: 2 additions & 1 deletion gmm_mi/cross_validation.py
@@ -71,7 +71,8 @@ def _run_cross_validation(self, X):
w_init, m_init, c_init, p_init = initialize_parameters(X, random_state=random_state,
n_components=self.n_components,
init_type=self.init_type,
scale=self.scale)
scale=self.scale,
reg_covar=self.reg_covar)
# perform k-fold CV
for k_id, (train_indices, valid_indices) in enumerate(self.kf.split(X)):
X_training = X[train_indices]
32 changes: 32 additions & 0 deletions gmm_mi/gmm.py
@@ -382,6 +382,38 @@ def estimate_cMI_MC(self, MC_samples=1e5):
MI = np.mean(marginal_zs + joint - marginal_xz - marginal_yz)
self.MI = MI
return MI

def estimate_MI_MC_highdim(self, split, MC_samples=1e5):
"""Compute the mutual information among multidimensional variables associated with a particular GMM model,
using MC integration.

Parameters
----------
split : int
Index at which the feature columns are split between the two variables: features [:split] belong to the first variable, and features [split:] to the second.
MC_samples : integer, default=1e5
Number of Monte Carlo (MC) samples to perform numerical integration of the MI integral.

Returns
-------
MI : float
The value of mutual information.
"""
points, _ = self.sample(MC_samples)
# evaluate the log-likelihood for the joint probability
joint = self.score_samples(points)
# and the marginals (x) and (y) too, but this time they are multidimensional
gmm_x = GMMWithMI(n_components=self.n_components, weights_init=self.weights_,
means_init=self.means_[:, :split], covariances_init=self.covariances_[:, :split, :split])
marginal_x = gmm_x.score_samples(points[:, :split])
# and for y
gmm_y = GMMWithMI(n_components=self.n_components, weights_init=self.weights_,
means_init=self.means_[:, split:], covariances_init=self.covariances_[:, split:, split:])
marginal_y = gmm_y.score_samples(points[:, split:])
# finally, we put everything together
MI = np.mean(joint - marginal_x - marginal_y)
self.MI = MI
return MI
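As a sanity check of the estimator above, which averages log p(x, y) - log p(x) - log p(y) over samples drawn from the joint, here is a self-contained sketch on a single correlated Gaussian, where the analytic value is -0.5 * log(1 - rho^2); this uses scipy only and makes no assumptions about GMM-MI internals:

import numpy as np
from scipy.stats import multivariate_normal

rho = 0.6
joint = multivariate_normal(mean=[0.0, 0.0], cov=[[1.0, rho], [rho, 1.0]])
samples = joint.rvs(size=100_000, random_state=42)
log_joint = joint.logpdf(samples)
log_x = multivariate_normal(mean=[0.0], cov=[[1.0]]).logpdf(samples[:, :1])  # marginal of x
log_y = multivariate_normal(mean=[0.0], cov=[[1.0]]).logpdf(samples[:, 1:])  # marginal of y
MI_mc = np.mean(log_joint - log_x - log_y)  # MC estimate, ~0.223 nat
MI_exact = -0.5 * np.log(1.0 - rho**2)      # analytic value, ~0.223 nat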

def estimate_KL_MC(self, kl_order='forward', MC_samples=1e5):
"""Compute the KL-divergence (KL) associated with a particular GMM model,
1 change: 0 additions & 1 deletion gmm_mi/initializations.py
@@ -206,7 +206,6 @@ def initialize_parameters(X, random_state=None, n_components=1, init_type='rando
If scale is not given, it will be set such that the volume of all components completely fills the space covered by data.
reg_covar : float, default=0
Constant regularisation term added to the diagonal of each covariance matrix, to avoid singular matrices.
In GMM-MI, the regularisation term is added during training only, and not in the initializations.

Returns
----------
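For context on this docstring change (reg_covar is now also applied in the initializations, as the cross_validation.py hunk above shows), a tiny standalone illustration of what the regularisation term does:

import numpy as np

cov = np.array([[1.0, 1.0],
                [1.0, 1.0]])            # singular covariance matrix (determinant 0)
reg_covar = 1e-6
cov_reg = cov + reg_covar * np.eye(2)   # regularised diagonal: determinant ~2e-6 > 0
print(np.linalg.det(cov), np.linalg.det(cov_reg))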
114 changes: 77 additions & 37 deletions gmm_mi/mi.py
@@ -114,44 +114,53 @@ def __getattr__(self, attr):
return getattr(self.mi_dist_params, attr)
except:
pass

# this is only to make the class picklable, to save/load the MI estimator
def __getstate__(self): return self.__dict__
def __setstate__(self, d): self.__dict__.update(d)

def _check_shapes(self, X, Y):
""" Check that the shapes of the arrays given as input to GMM-MI are either 2D or 1D,
and return the correct array to give as input.
""" Check that the shapes of the arrays given as input to GMM-MI are 2D, 1D,
or with the correct conditional structure. Return the correct array to give as input.

Parameters
----------
X : array-like of shape (n_samples, 2), (n_samples, 1), (n_samples) or (n_samples, 2+n_var)
X : array-like of shape (n_samples, n_features), where the accepted n_features depends on the use case.
Samples from the joint distribution of the two variables whose MI or KL is calculated.
If Y is None, must be of shape (n_samples, 2); otherwise, it must be either (n_samples, 1) or (n_samples).
The (n_samples, 2+n_var) case is for the conditional MI(X, Y | Z_1, ..., z_n), where conditional MI is computed.
Y : array-like of shape (n_samples, 1) or (n_samples), default=None
If Y is None, must be of shape (n_samples, 2) for standard MI, or (n_samples, 2+n_var)
for conditional MI, or (n_samples, features_1 + features_2) for multidimensional variables;
otherwise, it must be either (n_samples, 1) or (n_samples) for the standard case,
or (n_samples, features_1) in the high-dimensional case.
The case (n_samples, 2+n_var) is reserved for the conditional MI(X, Y | Z_1, ..., Z_n);
the variables must be given in this order.
Y : array-like of shape (n_samples, n_features), default=None
Samples from the marginal distribution of one of the two variables whose MI or KL is calculated.
If None, X must be of shape (n_samples, 2); otherwise, X and Y must be (n_samples, 1) or (n_samples).

If None, all features must already be in X; otherwise, Y must be of shape (n_samples, features_2).
Returns
----------
X : array-like of shape (n_samples, 2) or (n_samples, 2+n_var)
X : array-like of shape (n_samples, n_features)
The 2D array that is used to estimate MI or KL, with all features stacked along the second axis.
"""
if len(X.shape) == 1:
X = np.reshape(X, (X.shape[0], 1)) # add extra dimension
if Y is None:
if X.shape[1] < 2:
raise ValueError(f"Y is None, but the input array X is not 2- (or higher-) dimensional. "\
f"In this case, both X and Y should be 1-dimensional.")
f"Please input a Y array or increase the dimensionality of X.")
else:
return X
# if Y is not None, we can manipulate it
else:
if X.shape[0] != Y.shape[0]:
raise ValueError(f"X and Y have different numbers of samples. Found {X.shape[0]} and {Y.shape[0]}.")
if len(Y.shape) == 1:
Y = np.reshape(Y, (Y.shape[0], 1)) # add extra dimension
if X.shape[1] == 1 and Y.shape[1] == 1:
X = np.hstack((X, Y))
return X
else:
raise ValueError(f"Y is not None, but the input arrays X or Y are not 1-dimensional. "\
f"Shapes found: {X.shape}, {Y.shape}.")
if not self.split:
self.split = X.shape[1]
X = np.hstack((X, Y))
return X
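A short sketch of the inference path above: when Y is given and no split was passed, split is set to X.shape[1] before the two arrays are stacked (hypothetical data; the import path is assumed from this repo's layout):

import numpy as np
from gmm_mi.mi import EstimateMI

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))   # first variable, 3 features
Y = rng.normal(size=(500, 2))   # second variable, 2 features
mi_estimator = EstimateMI()
MI_mean, MI_std = mi_estimator.fit_estimate(X, Y)   # split inferred as 3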


def _select_best_metric(self, n_components):
"""Select best metric to choose the number of GMM components.
@@ -248,6 +257,7 @@ def _check_convergence(self, n_components):
Number of GMM components being fitted.
"""
if self.metric - self.best_metric > self.threshold_components:
self.patience_counter = 0 # reset patience counter
self.best_metric = self.metric
if self.verbose:
print(f'Current number of GMM components: {n_components}. Current metric: {self.best_metric:.3f}. Adding one component...')
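The added reset matters for the early-stopping loop that selects the number of components; a hypothetical sketch of the patience logic (the increment branch is assumed from context and not shown in this hunk):

import numpy as np

best_metric, patience_counter = -np.inf, 0
patience, threshold = 2, 0.0
for n_components, metric in enumerate([-10.0, -8.0, -8.05, -7.9, -7.95, -7.96], start=1):
    if metric - best_metric > threshold:
        patience_counter = 0     # reset on improvement: the line added in this PR
        best_metric = metric
    else:
        patience_counter += 1    # assumed increment branch
        if patience_counter >= patience:
            break                # stop adding GMM components
# without the reset, the failed attempt at 3 components would still count
# against patience after the improvement at 4 components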
@@ -281,18 +291,21 @@ def _calculate_MI(self, gmm, tol_int=1.49e-8, limit=np.inf):
The value of MI, in the units specified by the base provided as input to the `estimate` method.
"""
if self.integral_method == 'MC':
if self.conditional == False:
MI = gmm.estimate_MI_MC(MC_samples=self.MC_samples)
else:
if self.conditional:
MI = gmm.estimate_cMI_MC(MC_samples=self.MC_samples)
elif self.split is not None:
MI = gmm.estimate_MI_MC_highdim(split=self.split, MC_samples=self.MC_samples)
else:
MI = gmm.estimate_MI_MC(MC_samples=self.MC_samples)
elif self.integral_method == 'quad':
if self.conditional == False:
if not self.conditional and self.split is None:
MI = gmm.estimate_MI_quad(tol_int=tol_int, limit=limit)
else:
raise NotImplementedError(
"Estimating conditional MI with quadrature methods is currently not implemented. "
"Estimating conditional MI, or MI between multivariate variables, with quadrature methods "
"is currently not implemented. "
"Please use `MC` method, or write to dr.davide.piras@gmail.com if you would like "
"this feature added."
"this feature added, or raise an issue here: https://github.com/dpiras/GMM-MI/issues."
)
return MI

@@ -413,22 +426,28 @@ def _set_units(self, MI_mean, MI_std, base):
MI_std /= np.log(base)
return MI_mean, MI_std

def fit(self, X, Y=None, verbose=False):
def fit(self, X, Y=None, conditional=False, split=None, verbose=False):
"""Performs density estimation of the data using GMMs and k-fold cross-validation.
The fitted model will be used to estimate MI and/or KL.

Parameters
----------
X : array-like of shape (n_samples, 2), (n_samples, 1), (n_samples) or (n_samples, 3)
X : array-like of shape (n_samples, n_features), where the accepted n_features depends on the use case.
Samples from the joint distribution of the two variables whose MI or KL is calculated.
If Y is None, must be of shape (n_samples, 2) or (n_samples, 3);
otherwise, it must be either (n_samples, 1) or (n_samples).
If Y is None, must be of shape (n_samples, 2) for standard MI, or (n_samples, 2+n_var)
for conditional MI, or (n_samples, features_1 + features_2) for multidimensional variables;
otherwise, it must be either (n_samples, 1) or (n_samples) for the standard case,
or (n_samples, features_1) in the high-dimensional case.
The case (n_samples, 2+n_var) is reserved for the conditional MI(X, Y | Z_1, ..., Z_n);
the variables must be given in this order.
Y : array-like of shape (n_samples, 1) or (n_samples), default=None
Y : array-like of shape (n_samples, n_features), default=None
Samples from the marginal distribution of one of the two variables whose MI or KL is calculated.
If None, X must be of shape (n_samples, 2) or (n_samples, 2+n_var);
otherwise, X and Y must be (n_samples, 1) or (n_samples).
If None, all features must already be in X; otherwise, Y must be of shape (n_samples, features_2).
conditional : bool, default=False
Whether to compute conditional MI between unidimensional variables given a set of other unidimensional variables.
split : int, default=None
When computing MI between multidimensional variables, this integer sets where the feature columns are split:
features [:split] belong to the first variable, and features [split:] to the second.
verbose : bool, default=False
Whether to print useful procedural statements.

@@ -437,6 +456,8 @@
None
"""
self.verbose = verbose
self.conditional = conditional
self.split = split
# if fit was already done, exit without doing anything
if self.fit_done:
if self.verbose:
@@ -447,11 +468,28 @@ def fit(self, X, Y=None, verbose=False):
X = self._check_shapes(X, Y)
self.X = X
if X.shape[1] >= 3:
self.conditional = True
if self.verbose:
print('Shape of input array is 3-D or higher, so conditional mutual information '
'will be computed.')

if self.conditional:
if self.split:
raise NotImplementedError("Both conditional and a split were indicated, but conditional MI "\
"between multidimensional variables is not implemented yet. "\
"Feel free to open an issue here if you need it: "\
"\https://github.com/dpiras/GMM-MI/issues.")
if self.verbose:
print('Shape of input array is 3-D or higher and conditional=True, so conditional '
'mutual information will be computed. '
'Warning: this feature is experimental and has not been thoroughly tested; '
'make sure the data distribution is captured properly.')
else:
if self.split:
if self.verbose:
print('Shape of input array is 3-D or higher and a split was indicated, so '
'mutual information between multivariate variables will be computed. '
'Warning: this feature is experimental and has not been thoroughly tested; '
'make sure the data distribution is captured properly.')
else:
raise ValueError(f"Shape of input array is 3-D or higher, but nor conditional nor a split was indicated. "\
f"Please indicate one of the two to proceed with the fit, or include a Y array to infer the split value.")

if self.verbose:
if not self.fixed_components:
print('Starting cross-validation procedure to select the number of GMM components...')
@@ -593,11 +631,13 @@ def estimate(self, mi_dist_params=None, base=np.exp(1), include_kl=False, kl_ord
self.KL_mean, self.KL_std = self._set_units(self.KL_mean, self.KL_std, base)
return MI_mean, MI_std

def fit_estimate(self, X, Y=None, mi_dist_params=None, include_kl=False, kl_order='forward', base=np.exp(1), verbose=False):
def fit_estimate(self, X, Y=None, conditional=False, split=None,
mi_dist_params=None, include_kl=False, kl_order='forward',
base=np.exp(1), verbose=False):
"""Combine the `fit` and `estimate` methods for easier calculation of MI.
See the respective methods for all information.
"""
self.fit(X=X, Y=Y, verbose=verbose)
self.fit(X=X, Y=Y, conditional=conditional, split=split, verbose=verbose)
MI_mean, MI_std = self.estimate(mi_dist_params=mi_dist_params, include_kl=include_kl,
kl_order=kl_order, base=base, verbose=verbose)
return MI_mean, MI_std
@@ -684,7 +724,7 @@ def plot_fitted_contours(self, parameters=None, extents=None, n_samples=None, **
def _calculate_MI_categorical(self):
"""Calculate mutual information (MI) integral given a Gaussian mixture model in 2D.
Use only Monte Carlo (MC) method.
The complete formula can be found in Appendix B of Piras et al. (2022).
The complete formula can be found in Appendix B of Piras et al. (2023).

Returns
-------
248 changes: 220 additions & 28 deletions notebooks/walkthrough_and_pitfalls.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name='gmm_mi',
version="0.8.0",
version="0.9.0",
author='Davide Piras',
author_email='dr.davide.piras@gmail.com',
description='Estimate mutual information distribution with Gaussian mixture models',