Merge pull request #23 from fdtomasi/develop

Datasets fix and tests addition
fdtomasi · Mar 7, 2020 · 3f99857 · 3f99857
2 parents 75a918d + 2de985d
commit 3f99857
Show file tree

Hide file tree

Showing 8 changed files with 181 additions and 117 deletions.
diff --git a/regain/__init__.py b/regain/__init__.py
@@ -28,4 +28,4 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-__version__ = '0.2.2'
+__version__ = '0.2.3'
diff --git a/regain/datasets/base.py b/regain/datasets/base.py
@@ -50,17 +50,10 @@ def _gaussian_case(
         normalize_starting_matrices=False, degree=2, epsilon=1e-2,
         keep_sparsity=False, proportional=False, **kwargs):
     modes = dict(
-        my=data_Meinshausen_Yuan,
-        mys=data_Meinshausen_Yuan_sparse_latent,
-        sin=make_sin,
-        fixed_sparsity=make_fixed_sparsity,
-        sincos=make_sin_cos,
-        gp=make_exp_sine_squared,
-        fede=make_fede,
-        sklearn=make_sparse_low_rank,
-        ma=make_ma_xue_zou,
-        mak=make_ma_xue_zou_rand_k,
-        ticc=make_ticc)
+        my=data_Meinshausen_Yuan, mys=data_Meinshausen_Yuan_sparse_latent,
+        sin=make_sin, fixed_sparsity=make_fixed_sparsity, sincos=make_sin_cos,
+        gp=make_exp_sine_squared, fede=make_fede, sklearn=make_sparse_low_rank,
+        ma=make_ma_xue_zou, mak=make_ma_xue_zou_rand_k, ticc=make_ticc)
 
     if mode is not None:
         # mode overrides other parameters, for back compatibility
@@ -84,7 +77,7 @@ def _gaussian_case(
     #         normalize_starting_matrices=normalize_starting_matrices,
     #         degree=degree, epsilon=epsilon, keep_sparsity=keep_sparsity,
     #         proportional=proportional)
-    thetas, thetas_obs, ells = func(n_dim_obs, n_dim_lat, T, **kwargs)
+    thetas, thetas_obs, ells = func(n_dim_obs=n_dim_obs, n_dim_lat=n_dim_lat, T=T, **kwargs)
     sigmas = list(map(np.linalg.inv, thetas_obs))
     # map(normalize_matrix, sigmas)  # in place
 
@@ -114,9 +107,11 @@ def _ising_case(
         for t in thetas
     ]
     data = np.array(samples)
+    X = np.vstack(data)
+    y = np.repeat(range(len(thetas)), n_samples).astype(int)
     if time_on_axis == "last":
         data = data.transpose(1, 2, 0)
-    return data, thetas
+    return Bunch(data=data, thetas=np.array(thetas), X=X, y=y)
 
 
 def _poisson_case(
@@ -129,9 +124,11 @@ def _poisson_case(
         for t in thetas
     ]
     data = np.array(samples)
+    X = np.vstack(data)
+    y = np.repeat(range(len(thetas)), n_samples).astype(int)
     if time_on_axis == "last":
         data = data.transpose(1, 2, 0)
-    return data, thetas
+    return Bunch(data=data, thetas=np.array(thetas), X=X, y=y)
 
 
 def make_dataset(

diff --git a/regain/datasets/gaussian.py b/regain/datasets/gaussian.py
@@ -65,13 +65,7 @@ def _permute_locations(locations, random_state):
     return new_locations
 
 
-def data_Meinshausen_Yuan(
-        p=198, h=2, n=200, T=10, random_state=None, **kwargs):
-    random_state = check_random_state(random_state)
-    nodes_locations = []
-    for i in range(p):
-        nodes_locations.append(list(random_state.uniform(size=2)))
-    probs = _compute_probabilities(nodes_locations, random_state)
+def _theta_my(probs, p, random_state, value=0.245):
     theta = np.zeros((p, p))
     for i in range(p):
         ix = np.argsort(probs[i, :])[::-1]
@@ -86,18 +80,26 @@ def data_Meinshausen_Yuan(
             ixs.append(ix[j])
             sel += 1
             j += 1
-        theta[i, ixs] = 0.245
-        theta[ixs, i] = 0.245
+        theta[i, ixs] = theta[ixs, i] = value
     to_remove = np.where(np.sum((theta != 0).astype(int), axis=1) > 4)[0]
     for t in to_remove:
         edges = np.nonzero(theta[t, :])[0]
         random_state.shuffle(edges)
         removable = edges[:-4]
-        theta[t, removable] = 0
-        theta[removable, t] = 0
-    np.fill_diagonal(theta, 1)
+        theta[t, removable] = theta[removable, t] = 0
 
+    np.fill_diagonal(theta, 1)
     assert is_pos_def(theta)
+    return theta
+
+
+def data_Meinshausen_Yuan(
+        p=198, h=2, n=200, T=10, random_state=None, **kwargs):
+    random_state = check_random_state(random_state)
+    nodes_locations = [list(random_state.uniform(size=2)) for i in range(p)]
+    probs = _compute_probabilities(nodes_locations, random_state)
+    theta = _theta_my(probs, p, random_state=random_state)
+
     latent = np.zeros((h, p))
     for i in range(h):
         for j in range(p):
@@ -124,32 +126,7 @@ def data_Meinshausen_Yuan(
         locations.append(locs)
         probs = _compute_probabilities(locs, random_state, theta_prev != 0)
 
-        theta = np.zeros((p, p))
-        for i in range(p):
-            ix = np.argsort(probs[i, :])[::-1]
-            no_nonzero = np.sum((theta != 0).astype(int), axis=1)[i]
-            ixs = []
-            sel = 1
-            j = 0
-            while sel <= 4 - no_nonzero and j < p:
-                if ix[j] < i:
-                    j += 1
-                    continue
-                ixs.append(ix[j])
-                sel += 1
-                j += 1
-            theta[i, ixs] = 0.245
-            theta[ixs, i] = 0.245
-        to_remove = np.where(np.sum((theta != 0).astype(int), axis=1) > 4)[0]
-        for t in to_remove:
-            edges = np.nonzero(theta[t, :])[0]
-            random_state.shuffle(edges)
-            removable = edges[:-4]
-            theta[t, removable] = 0
-            theta[removable, t] = 0
-        np.fill_diagonal(theta, 1)
-
-        assert is_pos_def(theta)
+        theta = _theta_my(probs, p, random_state=random_state)
         thetas.append(theta)
         theta_observed = theta - L
         assert is_pos_def(theta_observed)
@@ -164,37 +141,13 @@ def data_Meinshausen_Yuan(
 
 
 def data_Meinshausen_Yuan_sparse_latent(
-        p=198, h=2, n=200, T=10, random_state=None):
+        p=198, h=2, n=200, T=10, random_state=None, **kwargs):
     random_state = check_random_state(random_state)
     nodes_locations = [
         list(random_state.uniform(size=2)) for i in range(p + h)
     ]
     probs = _compute_probabilities(nodes_locations, random_state)
-    theta = np.zeros((p, p))
-    for i in range(p):
-        ix = np.argsort(probs[i, :-h])[::-1]
-        no_nonzero = np.sum((theta != 0).astype(int), axis=1)[i]
-        ixs = []
-        sel = 1
-        j = 0
-        while sel <= 4 - no_nonzero and j < p:
-            if ix[j] < i:
-                j += 1
-                continue
-            ixs.append(ix[j])
-            sel += 1
-            j += 1
-        theta[ixs, i] = theta[i, ixs] = 0.245
-    to_remove = np.where(np.sum((theta != 0).astype(int), axis=1) > 4)[0]
-    for t in to_remove:
-        edges = np.nonzero(theta[t, :])[0]
-        random_state.shuffle(edges)
-        removable = edges[:-4]
-        theta[t, removable] = 0
-        theta[removable, t] = 0
-    np.fill_diagonal(theta, 1)
-
-    assert is_pos_def(theta)
+    theta = _theta_my(probs[:, :-h], p, random_state=random_state)
 
     latent = np.zeros((h, p + h))
     to_put = (p + h) * 0.5
@@ -258,31 +211,7 @@ def data_Meinshausen_Yuan_sparse_latent(
         probs_lat = _compute_probabilities(
             locs[-h:], random_state, lat_prev != 0)
 
-        theta = np.zeros((p, p))
-        for i in range(p):
-            ix = np.argsort(probs_theta[i, :])[::-1]
-            no_nonzero = np.sum((theta != 0).astype(int), axis=1)[i]
-            ixs = []
-            sel = 1
-            j = 0
-            while sel <= 4 - no_nonzero and j < p:
-                if ix[j] < i:
-                    j += 1
-                    continue
-                ixs.append(ix[j])
-                sel += 1
-                j += 1
-            theta[i, ixs] = 0.245
-            theta[ixs, i] = 0.245
-        to_remove = np.where(np.sum((theta != 0).astype(int), axis=1) > 4)[0]
-        for t in to_remove:
-            edges = np.nonzero(theta[t, :])[0]
-            random_state.shuffle(edges)
-            removable = edges[:-4]
-            theta[t, removable] = 0
-            theta[removable, t] = 0
-        np.fill_diagonal(theta, 1)
-        assert is_pos_def(theta)
+        theta = _theta_my(probs_theta, p, random_state=random_state)
         thetas_O.append(theta)
 
         lat = np.zeros((h, h))
@@ -332,7 +261,7 @@ def data_Meinshausen_Yuan_sparse_latent(
 
 
 def make_ell(n_dim_obs=100, n_dim_lat=10):
-    """Doc."""
+    """Make ell matrix."""
     K_HO = np.zeros((n_dim_lat, n_dim_obs))
     for i in range(n_dim_lat):
         percentage = int(n_dim_obs * 0.99)
@@ -352,7 +281,8 @@ def make_ell(n_dim_obs=100, n_dim_lat=10):
     return L, K_HO
 
 
-def make_starting(n_dim_obs=100, n_dim_lat=10, degree=2, normalize=False):
+def make_starting(
+        n_dim_obs=100, n_dim_lat=10, degree=2, normalize=False, eps=1e-2):
     """Generate starting theta, theta_observed, L, K_HO."""
     L, K_HO = make_ell(n_dim_obs, n_dim_lat)
 
@@ -388,10 +318,12 @@ def make_starting(n_dim_obs=100, n_dim_lat=10, degree=2, normalize=False):
                 indices = np.random.choice(possible_idx, n_choice)
                 theta[i, indices] = theta[indices, i] = .5 / degree
 
-    assert (is_pos_def(theta))
-    theta_observed = theta - L
-    assert (is_pos_def(theta_observed))
-    return theta, theta_observed, L, K_HO
+    if not is_pos_def(theta):
+        theta += np.diag(np.sum(theta, axis=1) + eps)
+    if not is_pos_def(theta - L):
+        theta += np.diag(eps + np.diag(L))
+    assert (is_pos_def(theta - L))
+    return theta, theta - L, L, K_HO
 
 
 def _update_theta_l2(
@@ -413,7 +345,7 @@ def _update_theta_l2(
     return theta
 
 
-def _update_theta_l1(theta_init, no, n_dim_obs):
+def _update_theta_l1(theta_init, no, n_dim_obs, eps=1e-2):
     theta = theta_init.copy()
     rows = np.zeros(no)
     cols = np.zeros(no)
@@ -425,7 +357,8 @@ def _update_theta_l1(theta_init, no, n_dim_obs):
             0.12, 0, 0
         ]) if theta[r, c] == 0 else .06  # np.random.rand(1) * .35
         theta[c, r] = theta[r, c]
-    assert (is_pos_def(theta))
+    if not is_pos_def(theta):
+        theta += np.diag(np.sum(theta, axis=1) + eps)
     return theta
 
 
@@ -493,7 +426,8 @@ def make_covariance(
 
         assert np.linalg.matrix_rank(L) == n_dim_lat
         assert (is_pos_semidef(L))
-        assert (is_pos_def(theta - L))
+        if not is_pos_def(theta - L):
+            theta += np.diag(epsilon + np.diag(L))
 
         thetas.append(theta)
         thetas_obs.append(theta - L)
@@ -720,7 +654,7 @@ def make_ma_xue_zou(
 
 
 def make_ma_xue_zou_rand_k(
-        n_dim_obs=12, n_latent=3, T=1, epsilon=1e-3, sparsity=0.1):
+        n_dim_obs=12, n_latent=3, T=1, epsilon=1e-3, sparsity=0.1, **kwargs):
     """Generate the dataset as in Ma, Xue, Zou (2012)."""
     # p = n_dim_obs + n_latent  # int(n_dim_obs * 0.05)
     p = n_dim_obs + int(n_dim_obs * 0.05)

diff --git a/regain/datasets/ising.py b/regain/datasets/ising.py
@@ -33,7 +33,6 @@
 import itertools
 
 import numpy as np
-import networkx as nx
 import warnings
 
 
@@ -58,7 +57,7 @@ def ising_theta_generator(
         n_dim_obs=10, n=100, T=10, mode='l1', time_on_axis='first', change=1,
         responses=[-1, 1], random_graph='scale-free', probability=0.3,
         degree=2):
-    """Generates adjacency matix for Ising graphical model.
+    """Generates adjacency matrix for Ising graphical model.
 
     Parameters
     ----------
@@ -95,6 +94,7 @@ def ising_theta_generator(
     list:
         List of adjaceny matrix of length T.
     """
+    import networkx as nx
     if random_graph.lower() == 'erdos-renyi':
         graph = nx.random_graphs.fast_gnp_random_graph(n=n_dim_obs,
                                                        p=probability)

diff --git a/regain/datasets/poisson.py b/regain/datasets/poisson.py
@@ -29,7 +29,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import warnings
 
-import networkx as nx
 import numpy as np
 
 
@@ -88,6 +87,7 @@ def poisson_theta_generator(n_dim_obs=10,
     list:
         List of adjaceny matrix of length T.
     """
+    import networkx as nx
     if random_graph.lower() == 'erdos-renyi':
         graph = nx.random_graphs.fast_gnp_random_graph(n=n_dim_obs,
                                                        p=probability)