Stabilizes dot accumulation precision

LoserCheems · LoserCheems · commit 59f9e3b0a091 · 2025-12-01T11:53:34.000+08:00
Casts each element-wise product to float32 and sums it in a float32 accumulator, then casts the final sum back to the dtype of `x`, to reduce precision loss when accumulating low-precision inputs.
diff --git a/kernel_course/python_ops/dot.py b/kernel_course/python_ops/dot.py
@@ -19,8 +19,9 @@ def dot(
     x = x.reshape(-1)
     y = y.reshape(-1)
 
-    z = torch.tensor(0.0, device=x.device, dtype=x.dtype)
+    z = torch.tensor(0.0, device=x.device, dtype=torch.float32)
     for i in range(len(x)):
-        z += x[i] * y[i]
+        z += (x[i] * y[i]).to(torch.float32)
+    z = z.to(x.dtype)
 
     return z