mllam · SimonKamuk · Feb 12, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 12, 2025
diff --git a/.cirun.yml b/.cirun.yml
@@ -6,7 +6,7 @@ runners:
     # https://aws.amazon.com/ec2/instance-types/g4/
     instance_type: "g4ad.xlarge"
     # Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04), Frankfurt region
-    machine_image: "ami-0ba41b554b28d24a4"
+    machine_image: "ami-0266e0ac094b2f8b9"
     # use Frankfurt region
     region: "eu-central-1"
     preemptible: false

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fix duplicate tensor copy to CPU [\#106](https://github.com/mllam/neural-lam/pull/106) @observingClouds
 
+- Fix bug where the `inverse_softplus` used in clamping caused NaNs in the gradients [\#123](https://github.com/mllam/neural-lam/pull/123) @SimonKamuk
+
 ### Maintenance
 - update ci/cd testing setup to install torch version compatible with neural-lam
   dependencies [\#115](https://github.com/mllam/neural-lam/pull/115), @leifdenby

diff --git a/neural_lam/utils.py b/neural_lam/utils.py
@@ -313,18 +313,21 @@ def inverse_softplus(x, beta=1, threshold=20):
     """
     Inverse of torch.nn.functional.softplus
 
-    For x*beta above threshold, returns linear function for numerical
-    stability.
+    Input is clamped to approximately positive values of x, and the function is
+    linear for x*beta above threshold for numerical stability.
 
-    Input is clamped to x > ln(1+1e-6)/beta which is approximately positive
-    values of x.
-    Note that this torch.clamp_min will make gradients 0, but this is not a
+    Note that this torch.clamp will make gradients 0, but this is not a
     problem as values of x that are this close to 0 have gradients of 0 anyhow.
     """
-    non_linear_part = (
-        torch.log(torch.clamp_min(torch.expm1(x * beta), 1e-6)) / beta
+    x_clamped = torch.clamp(
+        x, min=torch.log(torch.tensor(1e-6 + 1)) / beta, max=threshold / beta
     )
-    x = torch.where(x * beta <= threshold, non_linear_part, x)
+
+    non_linear_part = torch.log(torch.expm1(x_clamped * beta)) / beta
+
+    below_threshold = x * beta <= threshold
+
+    x = torch.where(condition=below_threshold, input=non_linear_part, other=x)
 
     return x