TMP: Throwaway test commit

KshitijLakhani · KshitijLakhani · commit a385c4778c7c · 2025-11-13T19:56:30.000Z
Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py
@@ -438,7 +438,6 @@ def _setup_inputs(self):
         self.dp_size = self.mesh.shape.get(self.mesh_resource.dp_resource, 1)
         self.cp_size = self.mesh.shape.get(self.mesh_resource.cp_resource, 1)
         self.tp_size = self.mesh.shape.get(self.mesh_resource.tpsp_resource, 1)
-        breakpoint()
 
         key = jax.random.PRNGKey(0)
         q_key, k_key, v_key, bias_key, dropout_key = jax.random.split(key, 5)
@@ -663,7 +662,6 @@ def generate_random_segment_ids(
                             self.cp_reorder_fn(self.segment_pos_kv),
                         ),
                     )
-                    breakpoint()
                 case _:
                     raise ValueError(f"Unknown {self.seq_desc_format=}")
         else:
@@ -728,7 +726,6 @@ def to_dp_shardings(x):
 
                 self.seq_desc_sharding = jax.tree.map(to_dp_shardings, self.sequence_desciptor)
 
-        #jax.debug.breakpoint()
         if self.bias_shape == BiasShape._1HSS:
             self.bias_pspec = PartitionSpec(
                 None, self.mesh_resource.tpsp_resource, self.mesh_resource.cp_resource, None
diff --git a/tests/jax/utils.py b/tests/jax/utils.py
@@ -1465,7 +1465,6 @@ def assert_allclose(
     mismatch_counter = 0
     has_nonzero = jnp.any(actual != 0)
     print(f"has_nonzero: {has_nonzero}")
-    breakpoint()
     with np.printoptions(threshold=sys.maxsize):
         mismatch_mask = ~np.isclose(actual, desired, **tols) # True means mismatch
         diff_indices = np.argwhere(mismatch_mask)
diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu
@@ -506,6 +506,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           else
           {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if (devActualSeqlenQ) {
@@ -521,6 +522,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           }
           else {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if(devPtrCuSeqlensKV) {
@@ -538,6 +540,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           }
           else {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if(devActualSeqlenKV) {
@@ -556,6 +559,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           else
           {
             print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
       }
@@ -597,6 +601,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
           layout_group, actual_b, b, h, hg, d_qk, d_v, static_cast<int32_t *>(devPtrSeqOffsetsQ),
           static_cast<int32_t *>(devPtrSeqOffsetsKV), ragged_offset_type, devOffsetsQ, devOffsetsK,
           devOffsetsV, devOffsetsO, devOffsetsS);
+      cudaDeviceSynchronize();
       NVTE_CHECK_CUDA(cudaGetLastError());
       if (print_tensors) {
         if (devPtrSeqOffsetsQ) {
@@ -614,6 +619,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devPtrSeqOffsetsQ), 1, 0, actual_b,
                 /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if (devOffsetsQ) {
@@ -631,6 +637,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int64_t *>(devOffsetsQ), 1, 0, actual_b,
                 /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if (devPtrSeqOffsetsKV) {
@@ -648,6 +655,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int32_t *>(devPtrSeqOffsetsKV), 1, 0, actual_b,
                 /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
         if (devOffsetsK) {
@@ -665,6 +673,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(
             print_tensor_elements_2<<<1, 1, 0, stream>>>(
                 static_cast<int64_t *>(devOffsetsK), 1, 0, actual_b,
                 /*does not matter for single row*/ actual_b);
+            cudaDeviceSynchronize();
           }
         }
       }
diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py

Original file line number	Diff line number	Diff line change
`@@ -506,6 +506,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`506`	`506`	`else`
`507`	`507`	`{`
`508`	`508`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);`
	`509`	`+ cudaDeviceSynchronize();`
`509`	`510`	`}`
`510`	`511`	`}`
`511`	`512`	`if (devActualSeqlenQ) {`
`@@ -521,6 +522,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`521`	`522`	`}`
`522`	`523`	`else {`
`523`	`524`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenQ), 1, 0, actual_b, /*does not matter for single row*/actual_b);`
	`525`	`+ cudaDeviceSynchronize();`
`524`	`526`	`}`
`525`	`527`	`}`
`526`	`528`	`if(devPtrCuSeqlensKV) {`
`@@ -538,6 +540,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`538`	`540`	`}`
`539`	`541`	`else {`
`540`	`542`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devPtrCuSeqlensKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);`
	`543`	`+ cudaDeviceSynchronize();`
`541`	`544`	`}`
`542`	`545`	`}`
`543`	`546`	`if(devActualSeqlenKV) {`
`@@ -556,6 +559,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`556`	`559`	`else`
`557`	`560`	`{`
`558`	`561`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(static_cast<int32_t *>(devActualSeqlenKV), 1, 0, actual_b, /*does not matter for single row*/ actual_b);`
	`562`	`+ cudaDeviceSynchronize();`
`559`	`563`	`}`
`560`	`564`	`}`
`561`	`565`	`}`
`@@ -597,6 +601,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`597`	`601`	`layout_group, actual_b, b, h, hg, d_qk, d_v, static_cast<int32_t *>(devPtrSeqOffsetsQ),`
`598`	`602`	`static_cast<int32_t *>(devPtrSeqOffsetsKV), ragged_offset_type, devOffsetsQ, devOffsetsK,`
`599`	`603`	`devOffsetsV, devOffsetsO, devOffsetsS);`
	`604`	`+ cudaDeviceSynchronize();`
`600`	`605`	`NVTE_CHECK_CUDA(cudaGetLastError());`
`601`	`606`	`if (print_tensors) {`
`602`	`607`	`if (devPtrSeqOffsetsQ) {`
`@@ -614,6 +619,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`614`	`619`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(`
`615`	`620`	`static_cast<int32_t *>(devPtrSeqOffsetsQ), 1, 0, actual_b,`
`616`	`621`	`/*does not matter for single row*/ actual_b);`
	`622`	`+ cudaDeviceSynchronize();`
`617`	`623`	`}`
`618`	`624`	`}`
`619`	`625`	`if (devOffsetsQ) {`
`@@ -631,6 +637,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`631`	`637`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(`
`632`	`638`	`static_cast<int64_t *>(devOffsetsQ), 1, 0, actual_b,`
`633`	`639`	`/*does not matter for single row*/ actual_b);`
	`640`	`+ cudaDeviceSynchronize();`
`634`	`641`	`}`
`635`	`642`	`}`
`636`	`643`	`if (devPtrSeqOffsetsKV) {`
`@@ -648,6 +655,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`648`	`655`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(`
`649`	`656`	`static_cast<int32_t *>(devPtrSeqOffsetsKV), 1, 0, actual_b,`
`650`	`657`	`/*does not matter for single row*/ actual_b);`
	`658`	`+ cudaDeviceSynchronize();`
`651`	`659`	`}`
`652`	`660`	`}`
`653`	`661`	`if (devOffsetsK) {`
`@@ -665,6 +673,7 @@ void fused_attn_arbitrary_seqlen_fwd_impl(`
`665`	`673`	`print_tensor_elements_2<<<1, 1, 0, stream>>>(`
`666`	`674`	`static_cast<int64_t *>(devOffsetsK), 1, 0, actual_b,`
`667`	`675`	`/*does not matter for single row*/ actual_b);`
	`676`	`+ cudaDeviceSynchronize();`
`668`	`677`	`}`
`669`	`678`	`}`
`670`	`679`	`}`