From 750267eab697c0fa0636800c722ed1d2754ec656 Mon Sep 17 00:00:00 2001
From: Chris Choy
Date: Tue, 26 May 2020 15:30:05 -0700
Subject: [PATCH] gradcheck update to torch 1.5

---
 MinkowskiEngine/utils/gradcheck.py | 138 +++++++++++++++++------------
 tests/pool.py                      |   5 ++
 2 files changed, 84 insertions(+), 59 deletions(-)

diff --git a/MinkowskiEngine/utils/gradcheck.py b/MinkowskiEngine/utils/gradcheck.py
index 095b73b6..ede8dc79 100644
--- a/MinkowskiEngine/utils/gradcheck.py
+++ b/MinkowskiEngine/utils/gradcheck.py
@@ -24,83 +24,98 @@
 import torch
 import torch.testing
 import warnings
+from typing import Callable, Union, Optional
 from torch.autograd.gradcheck import _as_tuple, _differentiable_outputs, get_analytical_jacobian, get_numerical_jacobian, iter_tensors
 
 
-def gradcheck(func,
-              inputs,
-              eps=1e-6,
-              atol=1e-5,
-              rtol=1e-3,
-              raise_exception=True):
+def gradcheck(
+    func,
+    inputs,
+    eps: float = 1e-6,
+    atol: float = 1e-5,
+    rtol: float = 1e-3,
+    raise_exception: bool = True,
+    check_sparse_nnz: bool = False,
+    nondet_tol: float = 0.0
+) -> bool:
     r"""Check gradients computed via small finite differences against analytical
-    gradients w.r.t. tensors in :attr:`inputs` that are of floating point type
+    gradients w.r.t. tensors in :attr:`inputs` that are of floating point or complex type
     and with ``requires_grad=True``.
-
-    The check between numerical and analytical gradients has the same behaviour as
-    `numpy.allclose <https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html>`_,
-    i.e., it checks that
-
-    .. math::
-
-        \lvert a - n \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert n \rvert
-
-    holds for all elements of analytical gradient :math:`a` and numerical
-    gradient :math:`n`.
-
+    The check between numerical and analytical gradients uses :func:`~torch.allclose`.
     .. note::
         The default values are designed for :attr:`input` of double precision.
         This check will likely fail if :attr:`input` is of less precision, e.g.,
         ``FloatTensor``.
-
     .. warning::
        If any checked tensor in :attr:`input` has overlapping memory, i.e.,
        different indices pointing to the same memory address (e.g., from
       :func:`torch.expand`), this check will likely fail because the numerical
       gradients computed by point perturbation at such indices will change
       values at all other indices that share the same memory address.
-
     Args:
         func (function): a Python function that takes Tensor inputs and returns
             a Tensor or a tuple of Tensors
-        inputs (tuple of Tensor): inputs to the function
+        inputs (tuple of Tensor or Tensor): inputs to the function
         eps (float, optional): perturbation for finite differences
         atol (float, optional): absolute tolerance
         rtol (float, optional): relative tolerance
         raise_exception (bool, optional): indicating whether to raise an exception if
            the check fails. The exception gives more information about the
            exact nature of the failure. This is helpful when debugging gradchecks.
-
+        check_sparse_nnz (bool, optional): if True, gradcheck allows for SparseTensor input,
+            and for any SparseTensor at input, gradcheck will perform check at nnz positions only.
+        nondet_tol (float, optional): tolerance for non-determinism. When running
+            identical inputs through the differentiation, the results must either match
+            exactly (default, 0.0) or be within this tolerance.
     Returns:
         True if all differences satisfy allclose condition
     """
+    def fail_test(msg):
+        if raise_exception:
+            raise RuntimeError(msg)
+        return False
+
     tupled_inputs = _as_tuple(inputs)
+    if any(t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)) and not check_sparse_nnz:
+        return fail_test('gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.')
 
-    # Make sure that gradients are saved for all inputs
+    # Make sure that gradients are saved for at least one input
     any_input_requiring_grad = False
-    for inp in tupled_inputs:
-        if isinstance(inp, torch.Tensor):
-            if inp.requires_grad:
-                if inp.dtype != torch.float64:
-                    warnings.warn(
-                        'At least one of the inputs that requires gradient '
-                        'is not of double precision floating point. '
-                        'This check will likely fail if all the inputs are '
-                        'not of double precision floating point. ')
-                any_input_requiring_grad = True
+    for idx, inp in enumerate(tupled_inputs):
+        if isinstance(inp, torch.Tensor) and inp.requires_grad:
+            if not (inp.dtype == torch.float64 or inp.dtype == torch.complex128):
+                warnings.warn(
+                    'The {}th input requires gradient and '
+                    'is not a double precision floating point or complex. '
+                    'This check will likely fail if all the inputs are '
+                    'not of double precision floating point or complex. ')
+            content = inp._values() if inp.is_sparse else inp
+            if content.layout is not torch._mkldnn and any([s == 0 for s in content.stride()]):
+                raise RuntimeError(
+                    'The {}th input has a dimension with stride 0. gradcheck only '
+                    'supports inputs that are non-overlapping to be able to '
+                    'compute the numerical gradients correctly. You should call '
+                    '.contiguous on the input before passing it to gradcheck.')
+            any_input_requiring_grad = True
             inp.retain_grad()
     if not any_input_requiring_grad:
         raise ValueError(
             'gradcheck expects at least one input tensor to require gradient, '
             'but none of the them have requires_grad=True.')
 
-    output = _differentiable_outputs(func.apply(*inputs))
+    func_out = func.apply(*tupled_inputs)
+    output = _differentiable_outputs(func_out)
 
-    def fail_test(msg):
-        if raise_exception:
-            raise RuntimeError(msg)
-        return False
+    if not output:
+        for i, o in enumerate(func_out):
+            def fn(input):
+                return _as_tuple(func.apply(*input))[i]
+            numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)
+            for n in numerical:
+                if torch.ne(n, 0).sum() > 0:
+                    return fail_test('Numerical gradient for function expected to be zero')
+        return True
 
     for i, o in enumerate(output):
         if not o.requires_grad:
@@ -109,44 +124,49 @@ def fail_test(msg):
 
         def fn(input):
             return _as_tuple(func.apply(*input))[i]
 
-        analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(
-            tupled_inputs, o)
-        numerical = get_numerical_jacobian(fn, inputs, eps=eps)
+        analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o, nondet_tol=nondet_tol)
+        numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)
 
         if not correct_grad_sizes:
             return fail_test('Analytical gradient has incorrect size')
 
         for j, (a, n) in enumerate(zip(analytical, numerical)):
             if a.numel() != 0 or n.numel() != 0:
-                succ_index = (a - n).abs() <= (atol + rtol * n.abs())
-                if not succ_index.all():
-                    return fail_test(
-                        'Jacobian mismatch for output %d with respect to input %d,\n'
-                        'numerical:%s\nanalytical:%s\nsuccess:%s\ndifference a - n:%s\n'
-                        % (i, j, n, a, succ_index, a - n))
+                if not torch.allclose(a, n, rtol, atol):
+                    return fail_test('Jacobian mismatch for output %d with respect to input %d,\n'
+                                     'numerical:%s\nanalytical:%s\n' % (i, j, n, a))
 
         if not reentrant:
-            return fail_test(
-                'Backward is not reentrant, i.e., running backward with same '
-                'input and grad_output multiple times gives different values, '
-                'although analytical gradient matches numerical gradient')
+            return fail_test('Backward is not reentrant, i.e., running backward with same '
+                             'input and grad_output multiple times gives different values, '
+                             'although analytical gradient matches numerical gradient. '
+                             'The tolerance for nondeterminism was {}.'.format(nondet_tol))
 
     # check if the backward multiplies by grad_output
-    output = _differentiable_outputs(func.apply(*inputs))
+    output = _differentiable_outputs(func.apply(*tupled_inputs))
     if any([o.requires_grad for o in output]):
-        diff_input_list = list(iter_tensors(inputs, True))
+        diff_input_list = list(iter_tensors(tupled_inputs, True))
         if not diff_input_list:
             raise RuntimeError("no Tensors requiring grad found in input")
-        grads_input = torch.autograd.grad(
-            output,
-            diff_input_list, [torch.zeros_like(o) for o in output],
-            allow_unused=True)
+        grads_input = torch.autograd.grad(output, diff_input_list,
+                                          [torch.zeros_like(o, memory_format=torch.legacy_contiguous_format) for o in output],
+                                          allow_unused=True)
         for gi, i in zip(grads_input, diff_input_list):
             if gi is None:
                 continue
+            if isinstance(gi, torch.Tensor) and gi.layout != torch.strided:
+                if gi.layout != i.layout:
+                    return fail_test('grad is incorrect layout (' + str(gi.layout) + ' is not ' + str(i.layout) + ')')
+                if gi.layout == torch.sparse_coo:
+                    if gi.sparse_dim() != i.sparse_dim():
+                        return fail_test('grad is sparse tensor, but has incorrect sparse_dim')
+                    if gi.dense_dim() != i.dense_dim():
+                        return fail_test('grad is sparse tensor, but has incorrect dense_dim')
+                gi = gi.to_dense()
+                i = i.to_dense()
             if not gi.eq(0).all():
                 return fail_test('backward not multiplied by grad_output')
-            if gi.type() != i.type():
+            if gi.dtype != i.dtype or gi.device != i.device or gi.is_sparse != i.is_sparse:
                 return fail_test("grad is incorrect type")
             if gi.size() != i.size():
                 return fail_test('grad is incorrect size')
diff --git a/tests/pool.py b/tests/pool.py
index 3112d7a2..01f5e51b 100644
--- a/tests/pool.py
+++ b/tests/pool.py
@@ -211,6 +211,11 @@ def test_global_maxpool(self):
         self.assertTrue(
             gradcheck(fn, (input.F, input.coords_key, None, input.coords_man)))
 
+        if torch.cuda.is_available():
+            input_cuda = input.to(torch.device(0))
+            output_cuda = pool(input_cuda)
+            self.assertTrue(torch.allclose(output_cuda.F.cpu(), output.F))
+
     def test_unpool(self):
         in_channels, out_channels, D = 2, 3, 2
         coords, feats, labels = data_loader(in_channels)
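
For reviewers who want to exercise the patched checker outside the MinkowskiEngine test suite, here is a minimal usage sketch. It relies only on behavior visible in this diff: the wrapper calls func.apply(*inputs), so it expects a torch.autograd.Function subclass rather than a plain callable, and it warns unless the inputs are double precision with requires_grad=True. The ScaleByTwo Function and the driver below are hypothetical illustrations written against torch 1.5.x (the version this patch targets); they are not part of this commit or of the MinkowskiEngine API.

# Hypothetical usage sketch (not part of this commit). ScaleByTwo is a toy
# stand-in for a real MinkowskiEngine autograd Function such as the pooling
# function exercised in tests/pool.py.
import torch

from MinkowskiEngine.utils.gradcheck import gradcheck


class ScaleByTwo(torch.autograd.Function):
    """Toy Function with a known Jacobian: y = 2 * x, so dy/dx = 2 * I."""

    @staticmethod
    def forward(ctx, x):
        return 2 * x

    @staticmethod
    def backward(ctx, grad_out):
        return 2 * grad_out


if __name__ == '__main__':
    # gradcheck calls func.apply(*inputs), so pass the Function class itself,
    # not an instance or a lambda. Inputs should be float64 and require
    # gradients, otherwise the check warns and will likely fail.
    x = torch.randn(4, 3, dtype=torch.float64, requires_grad=True)
    assert gradcheck(ScaleByTwo, (x,), atol=1e-5, rtol=1e-3)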