Merge pull request #61 from samuelstjean/threading

Add threading option
samuelstjean · Oct 22, 2017 · 823ac3c · 823ac3c
2 parents d91a786 + 976d540
commit 823ac3c
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 59 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,9 +2,11 @@
 
 ## [0.6] - Development version
 
-- PIESNO will now warn if less than 1% of noisy voxels were identified, which might indicate that something have gone wrong during the noise estimation.
+- PIESNO will now warn if less than 1% of noisy voxels were identified, which might indicate that something has gone wrong during the noise estimation.
 - On python >= 3.4, --mp_method [a_valid_start_method](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) can now be used to control behavior in the multiprocessing loop.
 - A new option --split_b0s can be specified to split the b0s equally amongst the training data.
+- A new (kind of experimental) option --use_f32 can be specified to use the float32 mode of spams and reduce ram usage.
+- A new option --use_threading can be specified to disable python multiprocessing and solely rely on threading capabilities of the linear algebra libs during denoising.
 - Fixed crash in option --noise_est local_std when --cores 1 was also supplied.
 - setup.py and requirements.txt will now fetch spams v2.6, with patches for numpy 1.12 support.
 - The GSL library and associated headers are now bundled for all platforms.

diff --git a/nlsam/denoiser.py b/nlsam/denoiser.py
@@ -13,6 +13,13 @@
 
 from scipy.sparse import lil_matrix
 
+try:
+    # python 2
+    from itertools import imap
+except ImportError:
+    # python 3
+    imap = map
+
 try:
     import spams
     warnings.filterwarnings("ignore", category=FutureWarning, module='spams')
@@ -24,7 +31,8 @@
 
 def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
                   mask=None, is_symmetric=False, n_cores=None, split_b0s=False,
-                  subsample=True, n_iter=10, b0_threshold=10, verbose=False, mp_method=None):
+                  subsample=True, n_iter=10, b0_threshold=10, dtype=np.float64,
+                  use_threading=False, verbose=False, mp_method=None):
     """Main nlsam denoising function which sets up everything nicely for the local
     block denoising.
 
@@ -62,6 +70,14 @@ def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
         Maximum number of iterations for the reweighted l1 solver.
     b0_threshold : int, default 10
         A b-value below b0_threshold will be considered as a b0 image.
+    dtype : np.float32 or np.float64, default np.float64
+        Precision to use for inner computations. Note that np.float32 should only be used for
+        very, very large datasets (that is, your ram starts swappping) as it can lead to numerical precision errors.
+    use_threading : bool, default False
+        Do not use multiprocessing, but rather rely on the multithreading capabilities of your numerical solvers.
+        While this mode is more memory friendly, it is also slower than using the multiprocessing mode (the default).
+        Moreover, it also assumes that your blas/lapack/spams library are built with multithreading, so be sure to check
+        that your computer is using multiple cores or the algorithm will just take much longer to complete.
     verbose : bool, default False
         print useful messages.
     mp_method : string
@@ -89,6 +105,9 @@ def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
         raise ValueError('Block shape {} and data shape {} are not of the same '
                          'length'.format(data.shape, block_size.shape))
 
+    if not ((dtype == np.float32) or (dtype == np.float64)):
+        raise ValueError('dtype should be either np.float32 or np.float64, but is {}'.format(dtype))
+
     b0_loc = np.where(bvals <= b0_threshold)[0]
     dwis = np.where(bvals > b0_threshold)[0]
     num_b0s = len(b0_loc)
@@ -147,7 +166,7 @@ def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
     divider = np.zeros(data.shape[-1])
 
     # Put all idx + b0 in this array in each iteration
-    to_denoise = np.empty(data.shape[:-1] + (block_size[-1] + 1,), dtype=np.float64)
+    to_denoise = np.empty(data.shape[:-1] + (block_size[-1] + 1,), dtype=dtype)
 
     for i, idx in enumerate(indexes, start=1):
         b0_loc = tuple((next(split_b0s_idx),))
@@ -163,8 +182,9 @@ def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
                                                           variance,
                                                           n_iter=n_iter,
                                                           mask=mask,
-                                                          dtype=np.float64,
+                                                          dtype=dtype,
                                                           n_cores=n_cores,
+                                                          use_threading=use_threading,
                                                           verbose=verbose,
                                                           mp_method=mp_method)
 
@@ -173,7 +193,7 @@ def nlsam_denoise(data, sigma, bvals, bvecs, block_size,
 
 
 def local_denoise(data, block_size, overlap, variance, n_iter=10, mask=None,
-                  dtype=np.float64, n_cores=None, verbose=False, mp_method=None):
+                  dtype=np.float64, n_cores=None, use_threading=False, verbose=False, mp_method=None):
     if verbose:
         logger.setLevel(logging.INFO)
 
@@ -213,12 +233,16 @@ def local_denoise(data, block_size, overlap, variance, n_iter=10, mask=None,
     param_alpha['D'] /= np.sqrt(np.sum(param_alpha['D']**2, axis=0, keepdims=True, dtype=dtype))
     param_D['D'] = param_alpha['D']
 
-    del train_data, X
+    del train_data, X, mask_col
 
-    param_alpha['numThreads'] = 1
-    param_D['numThreads'] = 1
+    if use_threading or (n_cores == 1):
+        param_alpha['numThreads'] = n_cores
+        param_D['numThreads'] = n_cores
+    else:
+        param_alpha['numThreads'] = 1
+        param_D['numThreads'] = 1
 
-    arglist = [(data[:, :, k:k + block_size[2]],
+    arglist = ((data[:, :, k:k + block_size[2]],
                 mask[:, :, k:k + block_size[2]],
                 variance[:, :, k:k + block_size[2]],
                 block_size,
@@ -227,21 +251,23 @@ def local_denoise(data, block_size, overlap, variance, n_iter=10, mask=None,
                 param_D,
                 dtype,
                 n_iter)
-               for k in range(data.shape[2] - block_size[2] + 1)]
+               for k in range(data.shape[2] - block_size[2] + 1))
 
-    time_multi = time()
-    parallel_processer = multiprocesser(_processer, n_cores=n_cores, mp_method=mp_method)
-    data_denoised = parallel_processer(arglist)
-    logger.info('Multiprocessing done in {0:.2f} mins.'.format((time() - time_multi) / 60.))
+    if use_threading:
+        data_denoised = imap(_processer, arglist)
+    else:
+        time_multi = time()
+        parallel_processer = multiprocesser(_processer, n_cores=n_cores, mp_method=mp_method)
+        data_denoised = parallel_processer(arglist)
+        logger.info('Multiprocessing done in {0:.2f} mins.'.format((time() - time_multi) / 60.))
 
     # Put together the multiprocessed results
     data_subset = np.zeros_like(data, dtype=np.float32)
     divider = np.zeros_like(data, dtype=np.int16)
-    ones = np.ones_like(data_denoised[0], dtype=np.int16)
 
-    for k in range(len(data_denoised)):
-        data_subset[:, :, k:k + block_size[2]] += data_denoised[k]
-        divider[:, :, k:k + block_size[2]] += ones
+    for k, content in enumerate(data_denoised):
+        data_subset[:, :, k:k + block_size[2]] += content
+        divider[:, :, k:k + block_size[2]] += 1
 
     data_subset /= divider
     return data_subset
@@ -295,7 +321,7 @@ def processer(data, mask, variance, block_size, overlap, param_alpha, param_D,
     X = im2col_nd(data, block_size, overlap)
     var_mat = np.median(im2col_nd(variance, block_size[:-1], overlap[:-1])[:, train_idx], axis=0)
     X_full_shape = X.shape
-    X = X[:, train_idx]
+    X = X[:, train_idx].astype(dtype)
 
     param_alpha['L'] = int(0.5 * X.shape[0])
 

diff --git a/nlsam/tests/test_scripts1.sh b/nlsam/tests/test_scripts1.sh
@@ -31,7 +31,7 @@ check_return_code $?
 # Test on niftis
 gunzip dwi_crop.nii.gz mask_crop.nii.gz sigma.nii.gz
 
-nlsam_denoising dwi_crop.nii dwi_nlsam.nii 1 bvals bvecs 5 -m mask_crop.nii -f --verbose --sh_order 0 --no_stabilization --load_sigma sigma.nii --is_symmetric
+nlsam_denoising dwi_crop.nii dwi_nlsam.nii 1 bvals bvecs 5 -m mask_crop.nii -f --verbose --sh_order 0 --no_stabilization --load_sigma sigma.nii --is_symmetric --use_threading
 check_return_code $?
 
 nlsam_denoising dwi_crop.nii dwi_nlsam.nii 1 bvals bvecs 5 -m mask_crop.nii -f --verbose --sh_order 0 --no_denoising --save_sigma sigma.nii --save_stab stab.nii

diff --git a/nlsam/tests/test_scripts2.sh b/nlsam/tests/test_scripts2.sh
@@ -25,7 +25,7 @@ check_return_code $?
 nlsam_denoising dwi_crop.nii.gz dwi_nlsam.nii.gz 1 bvals bvecs 5 -m mask_crop.nii.gz -f --verbose --sh_order 0 --load_sigma sigma.nii.gz --no_subsample --fix_implausible #--mp_method spawn
 check_return_code $?
 
-nlsam_denoising dwi_crop.nii.gz dwi_nlsam.nii.gz 1 bvals bvecs 5 -m mask_crop.nii.gz -f --verbose --sh_order 0 --noise_map noise.nii.gz --noise_mask pmask.nii.gz
+nlsam_denoising dwi_crop.nii.gz dwi_nlsam.nii.gz 1 bvals bvecs 5 -m mask_crop.nii.gz -f --verbose --sh_order 0 --noise_map noise.nii.gz --noise_mask pmask.nii.gz --use_f32
 check_return_code $?
 
 nlsam_denoising dwi_crop.nii.gz dwi_nlsam.nii.gz 1 bvals bvecs 5 -m mask_crop.nii.gz -f --verbose --sh_order 0 --load_sigma sigma.nii.gz --no_stabilization --no_denoising