From 317dfe54c9cddb7a0587f810380761ff46e57f91 Mon Sep 17 00:00:00 2001 From: Kshiteej K Date: Fri, 22 Mar 2024 23:30:45 +0100 Subject: [PATCH] triton_ce executor: don't error if triton is not installed (PR2466) (#29) --- thunder/executors/triton_crossentropy.py | 635 +----------------- thunder/executors/triton_crossentropy_impl.py | 631 +++++++++++++++++ thunder/extend/__init__.py | 6 +- thunder/tests/test_extend.py | 5 +- 4 files changed, 641 insertions(+), 636 deletions(-) create mode 100644 thunder/executors/triton_crossentropy_impl.py diff --git a/thunder/executors/triton_crossentropy.py b/thunder/executors/triton_crossentropy.py index fc22770434..277e605e78 100644 --- a/thunder/executors/triton_crossentropy.py +++ b/thunder/executors/triton_crossentropy.py @@ -1,635 +1,10 @@ -import math -from enum import Enum - -import torch - - -import thunder.torch as ltorch from thunder.executors import triton_utils +from thunder.extend import OperatorExecutor - -# Requires triton 2.1 or greater -min_triton_version = "2.1" triton_version: None | str = triton_utils.triton_version() -assert triton_version is not None, f"Trying to import a Triton executor, but Triton is unavailable" -TRITON_AVAILABLE: bool = triton_utils.is_triton_version_at_least(min_triton_version) -assert ( - TRITON_AVAILABLE -), f"Trying to import a Triton executor, but it requires Triton version {min_triton_version} or greater, and the current Triton version is {triton_version}" - -from thunder.extend import OperatorExecutor, register_executor - -triton_ex: OperatorExecutor = OperatorExecutor("triton", version=triton_version) -register_executor(triton_ex) - -import triton # noqa: E402 -import triton.language as tl # noqa: E402 - -# Temporarily borrowed from https://github.com/openai/triton -FORWARD_NUM_STAGES = 1 - - -class TritonDtype(Enum): - kFP16 = 0 - kBF16 = 1 - kFP32 = 2 - kFP64 = 3 - - -_TORCH2DTYPE = { - torch.float16: TritonDtype.kFP16, - torch.bfloat16: TritonDtype.kBF16, - torch.float32: TritonDtype.kFP32, - torch.float64: TritonDtype.kFP64, -} -_DTYPE2TRITON = { - TritonDtype.kFP16: tl.float16, - TritonDtype.kBF16: tl.bfloat16, - TritonDtype.kFP32: tl.float32, - TritonDtype.kFP64: tl.float64, -} - - -@triton.jit -def _class_indices_forward( - LOGITS, - PROBS, - IDX, - LOSS, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS: tl.constexpr, - CLASS_INDICES: tl.constexpr, - LABEL_SMOOTHING: tl.constexpr, - IGNORE_INDEX: tl.constexpr, - BUFFER_DTYPE: tl.constexpr, - BLOCK: tl.constexpr, -): - buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - logit_start_ptrs = LOGITS + row * N - logit_ptrs = logit_start_ptrs + cols - m_prev = -float("inf") - l_prev = 0.0 - m_prev = m_prev.to(buffer_dtype) - l_prev = l_prev.to(buffer_dtype) - - for start_n in range(0, tl.cdiv(N, BLOCK)): - row_logits = tl.load( - logit_ptrs, - mask=cols < N - (start_n * BLOCK), - other=-float("inf"), - ).to(buffer_dtype) - - m_curr = tl.maximum(tl.max(row_logits, 0), m_prev) - l_prev *= tl.exp(m_prev - m_curr) - p = tl.exp(row_logits - m_curr) - l_curr = tl.sum(p, 0) + l_prev - l_prev = l_curr - m_prev = m_curr - logit_ptrs += BLOCK - logit_ptrs = logit_start_ptrs + cols - output_ptrs = PROBS + row * N + cols - WRIT_PROBS = PROBS + row * N + cols - if LABEL_SMOOTHING: - sum_total = 0.0 - sum_total = sum_total.to(buffer_dtype) - weights_total = 0.0 - weights_total = weights_total.to(buffer_dtype) - if WEIGHTS: - weight_ptr = weight + cols - - l_prev_log = tl.log(l_prev) - 
# Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) - for start_n in range(0, tl.cdiv(N, BLOCK)): - row_logits = tl.load( - logit_ptrs, - mask=cols < N - start_n * BLOCK, - other=l_prev_log + m_prev, - ).to(buffer_dtype) - if LABEL_SMOOTHING and WEIGHTS: - full_weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0) - weights_total += tl.sum(full_weights_val, 0) - - row_minus_max = row_logits - m_prev - log_softmax = l_prev_log - row_minus_max - - if LABEL_SMOOTHING and WEIGHTS: - log_softmax *= full_weights_val - - if LABEL_SMOOTHING: - sum_total += tl.sum(log_softmax, 0) - # Store it back - - tl.store( - WRIT_PROBS, - log_softmax, - mask=cols < N - start_n * BLOCK, - ) - logit_ptrs += BLOCK - WRIT_PROBS += BLOCK - if LABEL_SMOOTHING and WEIGHTS: - weight_ptr += BLOCK - - idx = tl.load(IDX + row) - use_class = 0.0 - if IGNORE_INDEX >= 0: - use_class = idx == IGNORE_INDEX - READ_PROBS = PROBS + row * N + idx - tl.debug_barrier() - # write-back loss - probs = tl.load(READ_PROBS) - if WEIGHTS and not LABEL_SMOOTHING: - weight_ptr = weight + idx - weights_val = tl.load(weight_ptr) - probs = weights_val * probs - if LABEL_SMOOTHING: - tl.store(WEIGHT_BUFFER + row, weights_total) - probs = (1 - smoothing_factor) * probs + smoothing_factor * (sum_total) / N - probs = probs * (1.0 - use_class) - - tl.store(LOSS + row, probs) - - -@triton.jit -def _class_probs_forward( - LOGITS, - PROBS, - IDX, - LOSS, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS: tl.constexpr, - CLASS_INDICES: tl.constexpr, - LABEL_SMOOTHING: tl.constexpr, - IGNORE_INDEX: tl.constexpr, - BUFFER_DTYPE: tl.constexpr, - BLOCK: tl.constexpr, -): - buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] - row = tl.program_id(0) - cols = tl.arange(0, BLOCK) - logit_start_ptrs = LOGITS + row * N - logit_ptrs = logit_start_ptrs + cols - m_prev = -float("inf") - l_prev = 0.0 - m_prev = m_prev.to(buffer_dtype) - l_prev = l_prev.to(buffer_dtype) - - for start_n in range(0, tl.cdiv(N, BLOCK)): - row_logits = tl.load( - logit_ptrs, - mask=cols < N - (start_n * BLOCK), - other=-float("inf"), - ).to(buffer_dtype) - - m_curr = tl.maximum(tl.max(row_logits, 0), m_prev) - l_prev *= tl.exp(m_prev - m_curr) - p = tl.exp(row_logits - m_curr) - l_curr = tl.sum(p, 0) + l_prev - l_prev = l_curr - m_prev = m_curr - logit_ptrs += BLOCK - logit_ptrs = logit_start_ptrs + cols - output_ptrs = PROBS + row * N + cols - WRIT_PROBS = PROBS + row * N + cols - - sum_total = 0.0 - weights_total = 0.0 - sum_total = sum_total.to(buffer_dtype) - weights_total = weights_total.to(buffer_dtype) - idx_ptr = IDX + row * N + cols - if WEIGHTS: - weight_ptr = weight + cols - - l_prev_log = tl.log(l_prev) - # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) - for start_n in range(0, tl.cdiv(N, BLOCK)): - row_logits = tl.load( - logit_ptrs, - mask=cols < N - start_n * BLOCK, - other=l_prev_log + m_prev, - ).to(buffer_dtype) - idx = tl.load(idx_ptr, mask=cols < N - start_n * BLOCK, other=0.0) - full_weights_val = (1.0 - smoothing_factor) * idx + smoothing_factor / N - if WEIGHTS: - weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0) - full_weights_val = weights_val * full_weights_val - else: - full_weights_val = tl.where(cols < N - start_n * BLOCK, full_weights_val, 0.0) - weights_total += tl.sum(full_weights_val, 0) - - row_minus_max = row_logits - m_prev - log_softmax = l_prev_log - row_minus_max - - log_softmax *= 
full_weights_val - sum_total += tl.sum(log_softmax, 0) - # Store it back - - tl.store( - WRIT_PROBS, - log_softmax, - mask=cols < N - start_n * BLOCK, - ) - logit_ptrs += BLOCK - WRIT_PROBS += BLOCK - idx_ptr += BLOCK - if WEIGHTS: - weight_ptr += BLOCK - - tl.store(WEIGHT_BUFFER + row, weights_total) - probs = sum_total - - tl.store(LOSS + row, probs) - - -@triton.autotune( - configs=[ - # fmt: off - triton.Config({'BLOCK': 1024}, num_stages=FORWARD_NUM_STAGES, num_warps=1), - triton.Config({'BLOCK': 2048}, num_stages=FORWARD_NUM_STAGES, num_warps=8), - triton.Config({'BLOCK': 4096}, num_stages=FORWARD_NUM_STAGES, num_warps=8), - triton.Config({'BLOCK': 8192}, num_stages=FORWARD_NUM_STAGES, num_warps=16), - triton.Config({'BLOCK': 16384}, num_stages=FORWARD_NUM_STAGES, num_warps=16), - # fmt: on - ], - key=[ - "N", - "CLASS_INDICES", - "log_size_logits", - "BUFFER_DTYPE", - ], -) -@triton.jit -def _forward( - LOGITS, - PROBS, - IDX, - LOSS, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS: tl.constexpr, - CLASS_INDICES: tl.constexpr, - LABEL_SMOOTHING: tl.constexpr, - IGNORE_INDEX: tl.constexpr, - BUFFER_DTYPE: tl.constexpr, - BLOCK: tl.constexpr, -): - if CLASS_INDICES: - _class_indices_forward( - LOGITS, - PROBS, - IDX, - LOSS, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS, - CLASS_INDICES, - LABEL_SMOOTHING, - IGNORE_INDEX, - BUFFER_DTYPE, - BLOCK, - ) - else: - _class_probs_forward( - LOGITS, - PROBS, - IDX, - LOSS, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS, - CLASS_INDICES, - LABEL_SMOOTHING, - IGNORE_INDEX, - BUFFER_DTYPE, - BLOCK, - ) - - -@triton.autotune( - configs=[ - # fmt: off - triton.Config({'BLOCK': 1024}, num_stages=1, num_warps=1), - triton.Config({'BLOCK': 2048}, num_stages=1, num_warps=8), - triton.Config({'BLOCK': 4096}, num_stages=1, num_warps=8), - triton.Config({'BLOCK': 8192}, num_stages=1, num_warps=16), - triton.Config({'BLOCK': 16384}, num_stages=1, num_warps=16), - # fmt: on - ], - key=[ - "N", - "CLASS_INDICES", - "log_size_logits", - "BUFFER_DTYPE", - ], -) -@triton.jit -def _backward( - PROBS, - IDX, - DPROBS, - dprob_stride, - DIN, - weight, - N, - WEIGHT_BUFFER, - smoothing_factor, - log_size_logits, - WEIGHTS: tl.constexpr, - CLASS_INDICES: tl.constexpr, - LABEL_SMOOTHING: tl.constexpr, - IGNORE_INDEX: tl.constexpr, - BUFFER_DTYPE: tl.constexpr, - BLOCK: tl.constexpr, -): - buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] - row = tl.program_id(0) - start_n = tl.program_id(1) - cols = tl.arange(0, BLOCK) - PROBS = PROBS + row * N - # pointers to probs - probs_start = PROBS + cols + BLOCK * start_n - # for start_n in range(0, tl.cdiv(N, BLOCK)): # need to change this - probs = -tl.load( - probs_start, - mask=cols < N - (start_n * BLOCK), - other=float("inf"), - ).to(buffer_dtype) - DIN = DIN + row * N + cols + BLOCK * start_n - dout = tl.load(DPROBS + row * dprob_stride).to(buffer_dtype) - # We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - # and we have -log(p[k]) stored in PROBS, so this is easy - if CLASS_INDICES: - idx = tl.load(IDX + row) - delta = ((start_n * BLOCK) + cols) == idx - # write result in-place in PROBS - if IGNORE_INDEX >= 0: - use_class = idx == IGNORE_INDEX - dout = dout * (1 - use_class) - if LABEL_SMOOTHING: - if WEIGHTS: - weight_ptr = weight + cols + BLOCK * start_n - full_weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0).to(buffer_dtype) - weights_val = tl.load(weight + idx) - probs = probs / 
full_weights_val - probs = tl.exp(probs) - if WEIGHTS: - weights_total = tl.load(WEIGHT_BUFFER + row) - numerator_contrib = weights_val * (1.0 - smoothing_factor) * (probs - delta) - mean_contrib = ((weights_total * probs) - (full_weights_val)) * smoothing_factor / N - else: - numerator_contrib = (1.0 - smoothing_factor) * (probs - delta) - mean_contrib = (smoothing_factor * probs) - (smoothing_factor / N) - - din = (numerator_contrib + mean_contrib) * dout - - else: - probs = tl.exp(probs) - din = (probs - delta) * dout - if WEIGHTS: - weight_ptr = weight + idx - weights_val = tl.load(weight_ptr) - din = weights_val * din - else: - idx = tl.load( - IDX + row * N + cols + BLOCK * start_n, - mask=cols < N - start_n * BLOCK, - other=0.0, - ).to(buffer_dtype) - full_weights_val = (1.0 - smoothing_factor) * idx + smoothing_factor / N - weights_total = tl.load(WEIGHT_BUFFER + row) - if WEIGHTS: - weight_ptr = weight + cols + BLOCK * start_n - weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0).to(buffer_dtype) - full_weights_val = weights_val * full_weights_val - probs = probs / full_weights_val - probs = tl.exp(probs.to(buffer_dtype)) - weighted_probs = probs * weights_total - weighted_probs_per_class = weighted_probs - full_weights_val - din = (weighted_probs_per_class) * dout - - tl.store(DIN, din.to(DIN.dtype.element_ty), mask=cols + BLOCK * start_n < N) - - -class CrossEntropy(torch.autograd.Function): - @staticmethod - def forward( - ctx, - logits, - indices, - weight, - ignore_index, - reduction, - label_smoothing, - ): - buffer_dtype = None - # make sure we can use triton - # assert ( - # indices.dtype == torch.int64 - # ), "Indices are expected to be of type long." - assert weight is None or (len(weight.shape) == 1 and weight.shape[0] == logits.shape[-1]) - # make kernel - if buffer_dtype is None: - if logits.dtype in [torch.bfloat16, torch.float16]: - buffer_dtype = torch.float32 - else: - buffer_dtype = logits.dtype - buffer_dtype_enum = _TORCH2DTYPE[buffer_dtype] - device, dtype = logits.device, logits.dtype - n_cols = logits.shape[-1] - # run the kernel - result = torch.empty((logits.shape[0],), dtype=dtype, device=device) - # result = torch.empty_like(indices, dtype=dtype, device=device) - neg_logprobs = torch.empty_like(logits, dtype=buffer_dtype, device=device) - weights_buffer = torch.empty_like(result, dtype=buffer_dtype) - grid = lambda opt: (logits.numel() // n_cols,) - log_size_logits = int(math.log(math.prod(logits.shape) / n_cols)) - _forward[grid]( - logits, - neg_logprobs, - indices, - result, - weight, - n_cols, - weights_buffer, - label_smoothing, - log_size_logits, - WEIGHTS=(weight is not None), - CLASS_INDICES=(indices.dtype == torch.int64), - LABEL_SMOOTHING=(label_smoothing > 0.0), - IGNORE_INDEX=ignore_index, - BUFFER_DTYPE=buffer_dtype_enum, - ) - # save for backward - ctx.save_for_backward(neg_logprobs, indices, weights_buffer) - ctx.WEIGHT = weight - ctx.label_smoothing = label_smoothing - ctx.ignore_index = ignore_index - ctx.reduction = reduction - ctx.buffer_dtype = buffer_dtype_enum - if reduction == "none": - return result - elif reduction == "sum": - return result.sum(dim=0) - elif reduction == "mean": - if indices.dtype == torch.int64: - denom = (indices != ignore_index).float() - if weight is not None: - class_weights = weight[indices] - denom *= class_weights - denom = denom.sum() - else: - denom = indices.shape[0] - ctx.denom = denom - return (result.sum(dim=0) / denom).to(dtype) - - @staticmethod - def backward(ctx, 
dneg_logprobs): - """We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] - so we initialize the gradient as neg_logprobs, so we can just exponentiate - to get p[k], which is most of what we need... neg_logprobs will be - modified in place to become the gradient we want - """ - # load saved tensors - reduction = ctx.reduction - if reduction == "mean" or reduction == "sum": - dneg_logprobs = dneg_logprobs.expand(1) - neg_logprobs, indices, weights_buffer = ctx.saved_tensors - din = torch.empty_like(neg_logprobs) - weight = ctx.WEIGHT - buffer_dtype = ctx.buffer_dtype - # run the kernel - # neg_logprobs will be modified in place to become our gradient: - n_cols = neg_logprobs.shape[-1] - grid = lambda opt: ( - neg_logprobs.numel() // n_cols, - triton.cdiv(n_cols, opt["BLOCK"]), - ) - log_size_logits = int(math.log(math.prod(neg_logprobs.shape) / n_cols)) - _backward[grid]( - neg_logprobs, - indices, - dneg_logprobs, - dneg_logprobs.stride(0), - din, - weight, - n_cols, - weights_buffer, - ctx.label_smoothing, - log_size_logits, - WEIGHTS=(weight is not None), - CLASS_INDICES=(indices.dtype == torch.int64), - LABEL_SMOOTHING=(ctx.label_smoothing > 0.0), - IGNORE_INDEX=ctx.ignore_index, - BUFFER_DTYPE=buffer_dtype, - ) - if ctx.reduction == "mean": - din /= ctx.denom - return din, None, None, None, None, None, None - - -def cross_entropy( - input, - target, - weight=None, - ignore_index=-100, - reduction="mean", - label_smoothing=0.0, -): - r""" - Returns the Cross Entropy loss of input. If the target is class indcies - then the ignore_index argument is applicable, while the label_smoothing argument - is not. On the other hand, if the target is class probabilites, then the - label_smoothing argument is applicable, while the ignore_index argument is not. - - Args: - input: Tensor of shape (B, N) - where B is the batch dim and N is the number of classes - target: Int Tensor of shape (B,), min = 0, max = N-1 or - Float Tensor of shape (B, N), rows sum to 1.0 - Int tensor of class labels. - weight: Optional, Float Tensor of shape (N,) - Weight to scale each class - ignore_index: Int, which class label should be ignored - reduction: String: ['none', 'sum', 'mean'] - label_smoothing: Float between 0 and 1 - """ - return CrossEntropy.apply( - input, - target, - weight, - ignore_index, - reduction, - label_smoothing, - ) - - -# TODO: What is correct handling of ignore_index? 
-def cross_entropy_impl( - a, - target, - weight=None, - size_average=None, - ignore_index=-100, - reduce=None, - reduction="mean", - label_smoothing=0.0, -): - loss = cross_entropy(a, target, weight, ignore_index, reduction, label_smoothing) - - return loss - - -def cross_entropy_checker( - a, - /, - target, - weight=None, - size_average=None, - ignore_index=-100, - reduce=None, - reduction="mean", - label_smoothing=0.0, -) -> bool: - if triton is None: - return False - - torch_dtype = ltorch.to_torch_dtype(a.dtype) - if torch_dtype not in (torch.float16, torch.bfloat16, torch.float32, torch.float64): - return False - - # These arguments are deprecated and not supported - if size_average is not None or reduce is not None: - return False - - # We only support reduction of "sum", "mean" or "none" - if reduction not in ["sum", "mean", "none"]: - return False - - if len(a.shape) != 2: - return False - - return True - -import thunder.torch as ltorch +triton_ex: None | OperatorExecutor = None +if triton_version is not None: + from thunder.executors.triton_crossentropy_impl import triton_ex as impl_ex -ce = triton_ex.register_operator("triton_crossentropy", like=ltorch.cross_entropy, fn=cross_entropy_impl) -triton_ex.register_implementation(ltorch.cross_entropy, ce, checker=cross_entropy_checker) + triton_ex = impl_ex diff --git a/thunder/executors/triton_crossentropy_impl.py b/thunder/executors/triton_crossentropy_impl.py new file mode 100644 index 0000000000..ff36fcb450 --- /dev/null +++ b/thunder/executors/triton_crossentropy_impl.py @@ -0,0 +1,631 @@ +import math +from enum import Enum + +import torch + +from thunder.extend import OperatorExecutor, register_executor +from thunder.executors import triton_utils + +# Requires triton 2.1 or greater +min_triton_version = "2.1" + +triton_version: None | str = triton_utils.triton_version() +TRITON_AVAILABLE: bool = triton_utils.is_triton_version_at_least(min_triton_version) +assert ( + TRITON_AVAILABLE +), f"Trying to import a Triton executor, but it requires Triton version {min_triton_version} or greater, and the current Triton version is {triton_version}" + +triton_ex: OperatorExecutor = OperatorExecutor("triton", version=triton_version) +register_executor(triton_ex) + +import triton # noqa: E402 +import triton.language as tl # noqa: E402 + +# Temporarily borrowed from https://github.com/openai/triton +FORWARD_NUM_STAGES = 1 + + +class TritonDtype(Enum): + kFP16 = 0 + kBF16 = 1 + kFP32 = 2 + kFP64 = 3 + + +_TORCH2DTYPE = { + torch.float16: TritonDtype.kFP16, + torch.bfloat16: TritonDtype.kBF16, + torch.float32: TritonDtype.kFP32, + torch.float64: TritonDtype.kFP64, +} +_DTYPE2TRITON = { + TritonDtype.kFP16: tl.float16, + TritonDtype.kBF16: tl.bfloat16, + TritonDtype.kFP32: tl.float32, + TritonDtype.kFP64: tl.float64, +} + + +@triton.jit +def _class_indices_forward( + LOGITS, + PROBS, + IDX, + LOSS, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS: tl.constexpr, + CLASS_INDICES: tl.constexpr, + LABEL_SMOOTHING: tl.constexpr, + IGNORE_INDEX: tl.constexpr, + BUFFER_DTYPE: tl.constexpr, + BLOCK: tl.constexpr, +): + buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] + row = tl.program_id(0) + cols = tl.arange(0, BLOCK) + logit_start_ptrs = LOGITS + row * N + logit_ptrs = logit_start_ptrs + cols + m_prev = -float("inf") + l_prev = 0.0 + m_prev = m_prev.to(buffer_dtype) + l_prev = l_prev.to(buffer_dtype) + + for start_n in range(0, tl.cdiv(N, BLOCK)): + row_logits = tl.load( + logit_ptrs, + mask=cols < N - (start_n * 
BLOCK), + other=-float("inf"), + ).to(buffer_dtype) + + m_curr = tl.maximum(tl.max(row_logits, 0), m_prev) + l_prev *= tl.exp(m_prev - m_curr) + p = tl.exp(row_logits - m_curr) + l_curr = tl.sum(p, 0) + l_prev + l_prev = l_curr + m_prev = m_curr + logit_ptrs += BLOCK + logit_ptrs = logit_start_ptrs + cols + output_ptrs = PROBS + row * N + cols + WRIT_PROBS = PROBS + row * N + cols + if LABEL_SMOOTHING: + sum_total = 0.0 + sum_total = sum_total.to(buffer_dtype) + weights_total = 0.0 + weights_total = weights_total.to(buffer_dtype) + if WEIGHTS: + weight_ptr = weight + cols + + l_prev_log = tl.log(l_prev) + # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) + for start_n in range(0, tl.cdiv(N, BLOCK)): + row_logits = tl.load( + logit_ptrs, + mask=cols < N - start_n * BLOCK, + other=l_prev_log + m_prev, + ).to(buffer_dtype) + if LABEL_SMOOTHING and WEIGHTS: + full_weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0) + weights_total += tl.sum(full_weights_val, 0) + + row_minus_max = row_logits - m_prev + log_softmax = l_prev_log - row_minus_max + + if LABEL_SMOOTHING and WEIGHTS: + log_softmax *= full_weights_val + + if LABEL_SMOOTHING: + sum_total += tl.sum(log_softmax, 0) + # Store it back + + tl.store( + WRIT_PROBS, + log_softmax, + mask=cols < N - start_n * BLOCK, + ) + logit_ptrs += BLOCK + WRIT_PROBS += BLOCK + if LABEL_SMOOTHING and WEIGHTS: + weight_ptr += BLOCK + + idx = tl.load(IDX + row) + use_class = 0.0 + if IGNORE_INDEX >= 0: + use_class = idx == IGNORE_INDEX + READ_PROBS = PROBS + row * N + idx + tl.debug_barrier() + # write-back loss + probs = tl.load(READ_PROBS) + if WEIGHTS and not LABEL_SMOOTHING: + weight_ptr = weight + idx + weights_val = tl.load(weight_ptr) + probs = weights_val * probs + if LABEL_SMOOTHING: + tl.store(WEIGHT_BUFFER + row, weights_total) + probs = (1 - smoothing_factor) * probs + smoothing_factor * (sum_total) / N + probs = probs * (1.0 - use_class) + + tl.store(LOSS + row, probs) + + +@triton.jit +def _class_probs_forward( + LOGITS, + PROBS, + IDX, + LOSS, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS: tl.constexpr, + CLASS_INDICES: tl.constexpr, + LABEL_SMOOTHING: tl.constexpr, + IGNORE_INDEX: tl.constexpr, + BUFFER_DTYPE: tl.constexpr, + BLOCK: tl.constexpr, +): + buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] + row = tl.program_id(0) + cols = tl.arange(0, BLOCK) + logit_start_ptrs = LOGITS + row * N + logit_ptrs = logit_start_ptrs + cols + m_prev = -float("inf") + l_prev = 0.0 + m_prev = m_prev.to(buffer_dtype) + l_prev = l_prev.to(buffer_dtype) + + for start_n in range(0, tl.cdiv(N, BLOCK)): + row_logits = tl.load( + logit_ptrs, + mask=cols < N - (start_n * BLOCK), + other=-float("inf"), + ).to(buffer_dtype) + + m_curr = tl.maximum(tl.max(row_logits, 0), m_prev) + l_prev *= tl.exp(m_prev - m_curr) + p = tl.exp(row_logits - m_curr) + l_curr = tl.sum(p, 0) + l_prev + l_prev = l_curr + m_prev = m_curr + logit_ptrs += BLOCK + logit_ptrs = logit_start_ptrs + cols + output_ptrs = PROBS + row * N + cols + WRIT_PROBS = PROBS + row * N + cols + + sum_total = 0.0 + weights_total = 0.0 + sum_total = sum_total.to(buffer_dtype) + weights_total = weights_total.to(buffer_dtype) + idx_ptr = IDX + row * N + cols + if WEIGHTS: + weight_ptr = weight + cols + + l_prev_log = tl.log(l_prev) + # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA) + for start_n in range(0, tl.cdiv(N, BLOCK)): + row_logits = tl.load( + logit_ptrs, + 
mask=cols < N - start_n * BLOCK, + other=l_prev_log + m_prev, + ).to(buffer_dtype) + idx = tl.load(idx_ptr, mask=cols < N - start_n * BLOCK, other=0.0) + full_weights_val = (1.0 - smoothing_factor) * idx + smoothing_factor / N + if WEIGHTS: + weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0) + full_weights_val = weights_val * full_weights_val + else: + full_weights_val = tl.where(cols < N - start_n * BLOCK, full_weights_val, 0.0) + weights_total += tl.sum(full_weights_val, 0) + + row_minus_max = row_logits - m_prev + log_softmax = l_prev_log - row_minus_max + + log_softmax *= full_weights_val + sum_total += tl.sum(log_softmax, 0) + # Store it back + + tl.store( + WRIT_PROBS, + log_softmax, + mask=cols < N - start_n * BLOCK, + ) + logit_ptrs += BLOCK + WRIT_PROBS += BLOCK + idx_ptr += BLOCK + if WEIGHTS: + weight_ptr += BLOCK + + tl.store(WEIGHT_BUFFER + row, weights_total) + probs = sum_total + + tl.store(LOSS + row, probs) + + +@triton.autotune( + configs=[ + # fmt: off + triton.Config({'BLOCK': 1024}, num_stages=FORWARD_NUM_STAGES, num_warps=1), + triton.Config({'BLOCK': 2048}, num_stages=FORWARD_NUM_STAGES, num_warps=8), + triton.Config({'BLOCK': 4096}, num_stages=FORWARD_NUM_STAGES, num_warps=8), + triton.Config({'BLOCK': 8192}, num_stages=FORWARD_NUM_STAGES, num_warps=16), + triton.Config({'BLOCK': 16384}, num_stages=FORWARD_NUM_STAGES, num_warps=16), + # fmt: on + ], + key=[ + "N", + "CLASS_INDICES", + "log_size_logits", + "BUFFER_DTYPE", + ], +) +@triton.jit +def _forward( + LOGITS, + PROBS, + IDX, + LOSS, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS: tl.constexpr, + CLASS_INDICES: tl.constexpr, + LABEL_SMOOTHING: tl.constexpr, + IGNORE_INDEX: tl.constexpr, + BUFFER_DTYPE: tl.constexpr, + BLOCK: tl.constexpr, +): + if CLASS_INDICES: + _class_indices_forward( + LOGITS, + PROBS, + IDX, + LOSS, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS, + CLASS_INDICES, + LABEL_SMOOTHING, + IGNORE_INDEX, + BUFFER_DTYPE, + BLOCK, + ) + else: + _class_probs_forward( + LOGITS, + PROBS, + IDX, + LOSS, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS, + CLASS_INDICES, + LABEL_SMOOTHING, + IGNORE_INDEX, + BUFFER_DTYPE, + BLOCK, + ) + + +@triton.autotune( + configs=[ + # fmt: off + triton.Config({'BLOCK': 1024}, num_stages=1, num_warps=1), + triton.Config({'BLOCK': 2048}, num_stages=1, num_warps=8), + triton.Config({'BLOCK': 4096}, num_stages=1, num_warps=8), + triton.Config({'BLOCK': 8192}, num_stages=1, num_warps=16), + triton.Config({'BLOCK': 16384}, num_stages=1, num_warps=16), + # fmt: on + ], + key=[ + "N", + "CLASS_INDICES", + "log_size_logits", + "BUFFER_DTYPE", + ], +) +@triton.jit +def _backward( + PROBS, + IDX, + DPROBS, + dprob_stride, + DIN, + weight, + N, + WEIGHT_BUFFER, + smoothing_factor, + log_size_logits, + WEIGHTS: tl.constexpr, + CLASS_INDICES: tl.constexpr, + LABEL_SMOOTHING: tl.constexpr, + IGNORE_INDEX: tl.constexpr, + BUFFER_DTYPE: tl.constexpr, + BLOCK: tl.constexpr, +): + buffer_dtype = _DTYPE2TRITON[BUFFER_DTYPE.value] + row = tl.program_id(0) + start_n = tl.program_id(1) + cols = tl.arange(0, BLOCK) + PROBS = PROBS + row * N + # pointers to probs + probs_start = PROBS + cols + BLOCK * start_n + # for start_n in range(0, tl.cdiv(N, BLOCK)): # need to change this + probs = -tl.load( + probs_start, + mask=cols < N - (start_n * BLOCK), + other=float("inf"), + ).to(buffer_dtype) + DIN = DIN + row * N + cols + BLOCK * start_n + dout = tl.load(DPROBS + 
row * dprob_stride).to(buffer_dtype) + # We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] + # and we have -log(p[k]) stored in PROBS, so this is easy + if CLASS_INDICES: + idx = tl.load(IDX + row) + delta = ((start_n * BLOCK) + cols) == idx + # write result in-place in PROBS + if IGNORE_INDEX >= 0: + use_class = idx == IGNORE_INDEX + dout = dout * (1 - use_class) + if LABEL_SMOOTHING: + if WEIGHTS: + weight_ptr = weight + cols + BLOCK * start_n + full_weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0).to(buffer_dtype) + weights_val = tl.load(weight + idx) + probs = probs / full_weights_val + probs = tl.exp(probs) + if WEIGHTS: + weights_total = tl.load(WEIGHT_BUFFER + row) + numerator_contrib = weights_val * (1.0 - smoothing_factor) * (probs - delta) + mean_contrib = ((weights_total * probs) - (full_weights_val)) * smoothing_factor / N + else: + numerator_contrib = (1.0 - smoothing_factor) * (probs - delta) + mean_contrib = (smoothing_factor * probs) - (smoothing_factor / N) + + din = (numerator_contrib + mean_contrib) * dout + + else: + probs = tl.exp(probs) + din = (probs - delta) * dout + if WEIGHTS: + weight_ptr = weight + idx + weights_val = tl.load(weight_ptr) + din = weights_val * din + else: + idx = tl.load( + IDX + row * N + cols + BLOCK * start_n, + mask=cols < N - start_n * BLOCK, + other=0.0, + ).to(buffer_dtype) + full_weights_val = (1.0 - smoothing_factor) * idx + smoothing_factor / N + weights_total = tl.load(WEIGHT_BUFFER + row) + if WEIGHTS: + weight_ptr = weight + cols + BLOCK * start_n + weights_val = tl.load(weight_ptr, mask=cols < N - start_n * BLOCK, other=0.0).to(buffer_dtype) + full_weights_val = weights_val * full_weights_val + probs = probs / full_weights_val + probs = tl.exp(probs.to(buffer_dtype)) + weighted_probs = probs * weights_total + weighted_probs_per_class = weighted_probs - full_weights_val + din = (weighted_probs_per_class) * dout + + tl.store(DIN, din.to(DIN.dtype.element_ty), mask=cols + BLOCK * start_n < N) + + +class CrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + logits, + indices, + weight, + ignore_index, + reduction, + label_smoothing, + ): + buffer_dtype = None + # make sure we can use triton + # assert ( + # indices.dtype == torch.int64 + # ), "Indices are expected to be of type long." 
+ assert weight is None or (len(weight.shape) == 1 and weight.shape[0] == logits.shape[-1]) + # make kernel + if buffer_dtype is None: + if logits.dtype in [torch.bfloat16, torch.float16]: + buffer_dtype = torch.float32 + else: + buffer_dtype = logits.dtype + buffer_dtype_enum = _TORCH2DTYPE[buffer_dtype] + device, dtype = logits.device, logits.dtype + n_cols = logits.shape[-1] + # run the kernel + result = torch.empty((logits.shape[0],), dtype=dtype, device=device) + # result = torch.empty_like(indices, dtype=dtype, device=device) + neg_logprobs = torch.empty_like(logits, dtype=buffer_dtype, device=device) + weights_buffer = torch.empty_like(result, dtype=buffer_dtype) + grid = lambda opt: (logits.numel() // n_cols,) + log_size_logits = int(math.log(math.prod(logits.shape) / n_cols)) + _forward[grid]( + logits, + neg_logprobs, + indices, + result, + weight, + n_cols, + weights_buffer, + label_smoothing, + log_size_logits, + WEIGHTS=(weight is not None), + CLASS_INDICES=(indices.dtype == torch.int64), + LABEL_SMOOTHING=(label_smoothing > 0.0), + IGNORE_INDEX=ignore_index, + BUFFER_DTYPE=buffer_dtype_enum, + ) + # save for backward + ctx.save_for_backward(neg_logprobs, indices, weights_buffer) + ctx.WEIGHT = weight + ctx.label_smoothing = label_smoothing + ctx.ignore_index = ignore_index + ctx.reduction = reduction + ctx.buffer_dtype = buffer_dtype_enum + if reduction == "none": + return result + elif reduction == "sum": + return result.sum(dim=0) + elif reduction == "mean": + if indices.dtype == torch.int64: + denom = (indices != ignore_index).float() + if weight is not None: + class_weights = weight[indices] + denom *= class_weights + denom = denom.sum() + else: + denom = indices.shape[0] + ctx.denom = denom + return (result.sum(dim=0) / denom).to(dtype) + + @staticmethod + def backward(ctx, dneg_logprobs): + """We know d(-log(p[i])/dlogit[k] = -id_mat[i,k] + p[k] + so we initialize the gradient as neg_logprobs, so we can just exponentiate + to get p[k], which is most of what we need... neg_logprobs will be + modified in place to become the gradient we want + """ + # load saved tensors + reduction = ctx.reduction + if reduction == "mean" or reduction == "sum": + dneg_logprobs = dneg_logprobs.expand(1) + neg_logprobs, indices, weights_buffer = ctx.saved_tensors + din = torch.empty_like(neg_logprobs) + weight = ctx.WEIGHT + buffer_dtype = ctx.buffer_dtype + # run the kernel + # neg_logprobs will be modified in place to become our gradient: + n_cols = neg_logprobs.shape[-1] + grid = lambda opt: ( + neg_logprobs.numel() // n_cols, + triton.cdiv(n_cols, opt["BLOCK"]), + ) + log_size_logits = int(math.log(math.prod(neg_logprobs.shape) / n_cols)) + _backward[grid]( + neg_logprobs, + indices, + dneg_logprobs, + dneg_logprobs.stride(0), + din, + weight, + n_cols, + weights_buffer, + ctx.label_smoothing, + log_size_logits, + WEIGHTS=(weight is not None), + CLASS_INDICES=(indices.dtype == torch.int64), + LABEL_SMOOTHING=(ctx.label_smoothing > 0.0), + IGNORE_INDEX=ctx.ignore_index, + BUFFER_DTYPE=buffer_dtype, + ) + if ctx.reduction == "mean": + din /= ctx.denom + return din, None, None, None, None, None, None + + +def cross_entropy( + input, + target, + weight=None, + ignore_index=-100, + reduction="mean", + label_smoothing=0.0, +): + r""" + Returns the Cross Entropy loss of input. If the target is class indcies + then the ignore_index argument is applicable, while the label_smoothing argument + is not. 
On the other hand, if the target is class probabilites, then the + label_smoothing argument is applicable, while the ignore_index argument is not. + + Args: + input: Tensor of shape (B, N) + where B is the batch dim and N is the number of classes + target: Int Tensor of shape (B,), min = 0, max = N-1 or + Float Tensor of shape (B, N), rows sum to 1.0 + Int tensor of class labels. + weight: Optional, Float Tensor of shape (N,) + Weight to scale each class + ignore_index: Int, which class label should be ignored + reduction: String: ['none', 'sum', 'mean'] + label_smoothing: Float between 0 and 1 + """ + return CrossEntropy.apply( + input, + target, + weight, + ignore_index, + reduction, + label_smoothing, + ) + + +# TODO: What is correct handling of ignore_index? +def cross_entropy_impl( + a, + target, + weight=None, + size_average=None, + ignore_index=-100, + reduce=None, + reduction="mean", + label_smoothing=0.0, +): + loss = cross_entropy(a, target, weight, ignore_index, reduction, label_smoothing) + + return loss + + +def cross_entropy_checker( + a, + /, + target, + weight=None, + size_average=None, + ignore_index=-100, + reduce=None, + reduction="mean", + label_smoothing=0.0, +) -> bool: + if triton is None: + return False + + torch_dtype = ltorch.to_torch_dtype(a.dtype) + if torch_dtype not in (torch.float16, torch.bfloat16, torch.float32, torch.float64): + return False + + # These arguments are deprecated and not supported + if size_average is not None or reduce is not None: + return False + + # We only support reduction of "sum", "mean" or "none" + if reduction not in ["sum", "mean", "none"]: + return False + + if len(a.shape) != 2: + return False + + return True + + +import thunder.torch as ltorch + +ce = triton_ex.register_operator("triton_crossentropy", like=ltorch.cross_entropy, fn=cross_entropy_impl) +triton_ex.register_implementation(ltorch.cross_entropy, ce, checker=cross_entropy_checker) diff --git a/thunder/extend/__init__.py b/thunder/extend/__init__.py index 2acdcf6427..a1fd0c8361 100644 --- a/thunder/extend/__init__.py +++ b/thunder/extend/__init__.py @@ -306,13 +306,9 @@ def get_all_executors() -> tuple[Executor]: torch_compile, torchex, transformer_engineex, + triton_crossentropy, ) - if torch.cuda.is_available(): - # raise an error when a dependency is not available at import time - # TODO: this should only happen at runtime - from thunder.executors import triton_crossentropy - return tuple(_executor_map.values()) diff --git a/thunder/tests/test_extend.py b/thunder/tests/test_extend.py index b7dd300d12..592943d9e7 100644 --- a/thunder/tests/test_extend.py +++ b/thunder/tests/test_extend.py @@ -10,6 +10,7 @@ from thunder.core.proxies import TensorProxy from thunder.core.transforms import grad, get_grad, put_grads from thunder.extend import OperatorExecutor, register_executor, deregister_executor, get_all_executors +from lightning_utilities.core.imports import package_available def test_extend_core(): @@ -127,7 +128,9 @@ def test_get_all_executors_includes_all_native_executors(): "python", "transformer_engine", } - actual.discard("triton") # remove when triton can always be imported + if package_available("triton"): + # `triton` maybe installed on a system without GPU. + expected.update({"triton"}) if torch.cuda.is_available(): expected.update({"nvfuser"}) assert actual == expected
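
Note (not part of the patch): a minimal, hypothetical sketch of the behavior this change enables. After the patch, importing `thunder.executors.triton_crossentropy` no longer asserts when Triton is missing; the module exposes `triton_ex` as `None` in that case, and only constructs and registers the executor (via `triton_crossentropy_impl`) when a Triton version is detected. The sketch uses only names that appear in the diff (`triton_ex`, `get_all_executors`); the print messages are illustrative.

```python
# Hypothetical usage sketch, assuming the guarded import introduced by this patch.
from thunder.executors.triton_crossentropy import triton_ex  # safe even without Triton installed

if triton_ex is None:
    # Triton is not installed, so the executor was never constructed or registered.
    print("triton cross-entropy executor unavailable; other executors are unaffected")
else:
    # triton_crossentropy_impl registered the executor on import, so it should also
    # be returned by thunder.extend.get_all_executors() alongside the native executors.
    from thunder.extend import get_all_executors

    assert triton_ex in get_all_executors()
    print("triton cross-entropy executor is available")
```

This mirrors what the updated `test_get_all_executors_includes_all_native_executors` relies on: the executor registers itself whenever the `triton` package is importable, independent of CUDA availability, so `get_all_executors()` can import the module unconditionally.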