Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions mapalign/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,12 @@ def compute_nearest_neighbor_graph(K, n_neighbors=50):
return K


def compute_affinity(X, method='markov', eps=None, metric='euclidean'):
def compute_affinity(X, method='markov', f_neighbors = 0.01, eps=None, metric='euclidean'):
"""Compute the similarity or affinity matrix between the samples in X

:param X: A set of samples with number of rows > 1
:param method: 'markov' or 'cauchy' kernel (default: markov)
:param f_neighbors: Fraction of nearest neighbors to consider for automatic eps calculation
:param eps: scaling factor for kernel
:param metric: metric to compute pairwise distances
:return: a similarity matrix
Expand All @@ -147,8 +148,8 @@ def compute_affinity(X, method='markov', eps=None, metric='euclidean'):
import numpy as np
D = squareform(pdist(X, metric=metric))
if eps is None:
k = int(max(2, np.round(D.shape[0] * 0.01)))
eps = 2 * np.median(np.sort(D, axis=0)[k+1, :])**2
k = int(max(2, np.round(D.shape[0] * f_neighbors)))
eps = np.median(np.sort(D, axis=0)[0:k+1, :]**2)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I remember this correctly, eps is based on the (k+1)-th entry rather than everything up to that point. It basically provides a scaling factor given the sorted distances.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The approach currently implemented is more liberal than the one I propose and both of them are correct. All manifold learning approaches are based on local similarities and these should be emphasized - I think the median of all the values up to the k-th captures this property more accurately. Besides, I think that parametrization of the kernel in terms of average distances to a certain fraction of nearest neighbors is more natural, because it doesn't depend on the scale of the affinities that one uses.

if method == 'markov':
affinity_matrix = np.exp(-(D * D) / eps)
elif method == 'cauchy':
Expand Down
155 changes: 116 additions & 39 deletions mapalign/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
"""

import numpy as np
import scipy.sparse as sps

has_sklearn = True
try:
import sklearn
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics.pairwise import rbf_kernel
except ImportError:
has_sklearn = False

Expand Down Expand Up @@ -77,32 +81,8 @@ def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
raise ImportError('Checks require scikit-learn, but not found')

ndim = L.shape[0]
if overwrite:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

overwrite is no longer supported in the rewrite. i think this should still be used as a memory saving option.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mfalkiewicz - this is not supported in the refactor

L_alpha = L
else:
L_alpha = L.copy()

if alpha > 0:
# Step 2
d = np.array(L_alpha.sum(axis=1)).flatten()
d_alpha = np.power(d, -alpha)
if use_sparse:
L_alpha.data *= d_alpha[L_alpha.indices]
L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
L_alpha.data *= d_alpha[L_alpha.indices]
L_alpha = sps.csr_matrix(L_alpha.transpose().toarray())
else:
L_alpha = d_alpha[:, np.newaxis] * L_alpha
L_alpha = L_alpha * d_alpha[np.newaxis, :]

# Step 3
d_alpha = np.power(np.array(L_alpha.sum(axis=1)).flatten(), -1)
if use_sparse:
L_alpha.data *= d_alpha[L_alpha.indices]
else:
L_alpha = d_alpha[:, np.newaxis] * L_alpha

M = L_alpha
M = _compute_markov_matrix(L, use_sparse = use_sparse, alpha = alpha)

from scipy.sparse.linalg import eigs, eigsh
if eigen_solver is None:
Expand Down Expand Up @@ -130,6 +110,37 @@ def compute_diffusion_map(L, alpha=0.5, n_components=None, diffusion_time=0,
return_result)


def _compute_markov_matrix(A, use_sparse, alpha = 0):
"""
Computes Markov matrix from affinity matrix. Density normalization is optional.

:param A: Affinity matrix
:param alpha: Diffusion operator parameter
:return: Transition matrix
"""

if alpha > 0:
# Step 2
d = np.array(A.sum(axis=1)).flatten()
d_alpha = np.power(d, -alpha)
if use_sparse:
A.data *= d_alpha[A.indices]
A = sps.csr_matrix(A.transpose().toarray())
A.data *= d_alpha[A.indices]
A = sps.csr_matrix(A.transpose().toarray())
else:
A = d_alpha[:, np.newaxis] * A
A = A * d_alpha[np.newaxis, :]

# Step 3
d_alpha = np.power(np.array(A.sum(axis=1)).flatten(), -1)
if use_sparse:
A.data *= d_alpha[A.indices]
else:
A = d_alpha[:, np.newaxis] * A

return A

def _step_5(lambdas, vectors, ndim, n_components, diffusion_time, return_result):
"""
This is a helper function for diffusion map computation.
Expand Down Expand Up @@ -345,9 +356,8 @@ def _get_affinity_matrix(self, X, Y=None):
"""
if self.affinity == 'precomputed':
self.affinity_matrix_ = X
return self.affinity_matrix_
if self.affinity == 'nearest_neighbors':
if sparse.issparse(X):
if sps.issparse(X):
warnings.warn("Nearest neighbors affinity currently does "
"not support sparse input, falling back to "
"rbf affinity")
Expand All @@ -356,27 +366,24 @@ def _get_affinity_matrix(self, X, Y=None):
self.n_neighbors_ = (self.n_neighbors
if self.n_neighbors is not None
else max(int(X.shape[0] / 10), 1))
self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_,
affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_,
include_self=True,
n_jobs=self.n_jobs)
# currently only symmetric affinity_matrix supported
self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
self.affinity_matrix_.T)
return self.affinity_matrix_
if self.affinity == 'rbf':
self.gamma_ = (self.gamma
if self.gamma is not None else 1.0 / X.shape[1])
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
return self.affinity_matrix_
affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
if self.affinity in ['markov', 'cauchy']:
from .dist import compute_affinity
self.affinity_matrix_ = compute_affinity(X,
affinity_matrix_ = compute_affinity(X,
method=self.affinity,
eps=self.gamma,
metric=self.metric)
return self.affinity_matrix_
self.affinity_matrix_ = self.affinity(X)
return self.affinity_matrix_
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the function description should be modified if this function is not going to return the affinity matrix.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I re-added the return of affinity_matrix_, but not as a field in self. The reason is that in the code I am currently writing I want to be able to bind results of several affinity calculations to different fields in the object.


return affinity_matrix_

def fit(self, X, y=None):
"""Fit the model from data in X.
Expand Down Expand Up @@ -413,14 +420,15 @@ def fit(self, X, y=None):
raise ValueError(("'affinity' is expected to be an affinity "
"name or a callable. Got: %s") % self.affinity)

affinity_matrix = self._get_affinity_matrix(X)
self.affinity_ = self._get_affinity_matrix(X)

if self.use_variant:
self.embedding_ = compute_diffusion_map_psd(affinity_matrix,
self.embedding_ = compute_diffusion_map_psd(self.affinity_matrix_,
alpha=self.alpha,
n_components=self.n_components,
diffusion_time=self.diffusion_time)
else:
self.embedding_ = compute_diffusion_map(affinity_matrix,
self.embedding_ = compute_diffusion_map(self.affinity_matrix_,
alpha=self.alpha,
n_components=self.n_components,
diffusion_time=self.diffusion_time,
Expand All @@ -447,3 +455,72 @@ def fit_transform(self, X, y=None):
"""
self.fit(X)
return self.embedding_

class AlternatingDiffusion(DiffusionMapEmbedding):
    """Alternating diffusion: captures common sources of variability across
    two datasets by composing their Markov diffusion kernels.
    """

    def _diffuse(self, K, t):
        """Advance the Markov kernel ``K`` by ``t`` diffusion steps.

        :param K: Markov (transition) matrix.
        :param t: Number of diffusion steps (non-negative integer).
        :return: ``K`` advanced ``t`` steps in time, i.e. the matrix power
            ``K^t``.
        """
        # t steps of a Markov chain correspond to the *matrix* power K^t,
        # not the element-wise power np.power(K, t) (which does not even
        # yield a stochastic matrix).
        return np.linalg.matrix_power(K, int(t))

    def fit(self, D1, D2, time):
        """Fit the alternating-diffusion model from two aligned datasets.

        :param D1: Dataset 1 (n_samples x n_features).
        :param D2: Dataset 2 (same samples as D1, possibly other features).
        :param time: Number of diffusion steps applied to each kernel
            before they are combined.
        :return: self
        """
        from sklearn.utils import check_array, check_random_state

        D1 = check_array(D1, ensure_min_samples=2, estimator=self)
        D2 = check_array(D2, ensure_min_samples=2, estimator=self)

        # Kept for its validation side effect (raises on invalid seed).
        random_state = check_random_state(self.random_state)

        # Validate the affinity specification (same contract as the parent).
        if isinstance(self.affinity, (str,)):
            if self.affinity not in set(("nearest_neighbors", "rbf",
                                         "markov", "cauchy",
                                         "precomputed")):
                raise ValueError(("%s is not a valid affinity. Expected "
                                  "'precomputed', 'rbf', 'nearest_neighbors' "
                                  "or a callable.") % self.affinity)
        elif not callable(self.affinity):
            raise ValueError(("'affinity' is expected to be an affinity "
                              "name or a callable. Got: %s") % self.affinity)

        self.affinity_D1_ = self._get_affinity_matrix(D1)
        self.affinity_D2_ = self._get_affinity_matrix(D2)

        # Per-dataset Markov kernels (density-normalized by alpha).
        self.Markov_D1_ = _compute_markov_matrix(self.affinity_D1_,
                                                 use_sparse=False,
                                                 alpha=self.alpha)
        self.Markov_D2_ = _compute_markov_matrix(self.affinity_D2_,
                                                 use_sparse=False,
                                                 alpha=self.alpha)

        self.Markov_D1_t_ = self._diffuse(self.Markov_D1_, time)
        self.Markov_D2_t_ = self._diffuse(self.Markov_D2_, time)

        # Joint kernel: alternate one diffusion on D1 with one on D2.
        self.joint_kernel_ = self.Markov_D1_t_.dot(self.Markov_D2_t_)

        # Final embedding of the joint kernel.
        # NOTE(review): the joint kernel is already Markov-normalized, yet
        # alpha is applied again inside compute_diffusion_map -- confirm
        # this double normalization is intended.
        self.embedding_ = compute_diffusion_map(self.joint_kernel_,
                                                alpha=self.alpha,
                                                n_components=self.n_components,
                                                eigen_solver=self.eigen_solver)

        return self

    def fit_transform(self, D1, D2, time):
        """Fit the model from ``D1`` and ``D2`` and return the embedding."""
        self.fit(D1, D2, time)
        return self.embedding_