Integration with Triton Inference Server #311
WissamAntoun started this conversation in Ideas
-
It would really be interesting to see if `kernl` works well running inside the Triton Inference Server with the Python backend. I'm not sure if you have tested this way of deploying the model, since it would fit well with `transformer-deploy`.
So far I did a quick test. It required changing `setup.py` to support Python 3.8, and it also revealed a few issues with the `list` and `tuple` type hints, for example in https://github.com/ELS-RD/kernl/blob/main/src/kernl/optimizer/cuda_graph.py#L29. After I fixed those issues, I ran into `CUDA error: operation not permitted when stream is capturing`.
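For context, the `list` / `tuple` problem on Python 3.8 is the standard PEP 585 limitation: built-in generics such as `list[torch.Tensor]` are only subscriptable from Python 3.9 onward, so evaluating such an annotation under 3.8 fails at import time. Below is a minimal sketch of the two usual fixes, using a hypothetical signature rather than the actual code in `cuda_graph.py`:

```python
# Hypothetical signature, for illustration only (not the real code at
# cuda_graph.py#L29). On Python 3.8 the following fails at import time with
# "TypeError: 'type' object is not subscriptable":
#
#     def wrap(inputs: list[torch.Tensor]) -> tuple[torch.Tensor, ...]: ...
#
# Fix 1: postpone annotation evaluation for the whole module (PEP 563).
from __future__ import annotations

# Fix 2: use the typing generics, which already exist on Python 3.8.
from typing import List, Tuple

import torch


def wrap(inputs: List[torch.Tensor]) -> Tuple[torch.Tensor, ...]:
    """Same behaviour, 3.8-compatible hints."""
    return tuple(inputs)
```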
I was using a T5 model running inside Hugging Face text generation pipelines and `tritonserver:22.10`.
This is the `model.py` file I was using:
model.py
Error logs
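A rough sketch of what such a Triton Python-backend `model.py` can look like when it wraps a kernl-optimized T5 pipeline. This is illustrative only, not the attached file: the tensor names, the checkpoint, and the `optimize_model` import are assumptions.

```python
# model.py -- illustrative sketch, not the file attached to this post.
# Assumptions: I/O tensors "INPUT_TEXT"/"OUTPUT_TEXT" (TYPE_STRING), the
# "t5-small" checkpoint, and kernl's optimize_model entry point.
import numpy as np
import torch
import triton_python_backend_utils as pb_utils
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

from kernl.model_optimization import optimize_model  # assumed kernl API


class TritonPythonModel:
    def initialize(self, args):
        model_id = "t5-small"  # placeholder checkpoint
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id).eval().cuda()
        # kernl swaps eligible modules for fused Triton kernels and wraps the
        # forward pass in CUDA graphs; this is the step after which the
        # "operation not permitted when stream is capturing" error shows up.
        optimize_model(model)
        self.pipe = pipeline(
            "text2text-generation", model=model, tokenizer=tokenizer, device=0
        )

    def execute(self, requests):
        responses = []
        for request in requests:
            # TYPE_STRING inputs arrive as a numpy array of bytes objects.
            in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT")
            texts = [t.decode("utf-8") for t in in_tensor.as_numpy().reshape(-1)]
            generated = []
            with torch.inference_mode():
                for text in texts:
                    result = self.pipe(text)  # [{"generated_text": "..."}]
                    generated.append(result[0]["generated_text"].encode("utf-8"))
            out_tensor = pb_utils.Tensor(
                "OUTPUT_TEXT", np.array(generated, dtype=object)
            )
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[out_tensor])
            )
        return responses
```

The matching `config.pbtxt` would declare `backend: "python"` with `INPUT_TEXT` / `OUTPUT_TEXT` as `TYPE_STRING`; whether T5 generation is safe to run while kernl is capturing CUDA graphs is exactly the open question behind the error above.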
Replies: 1 comment
-
We have not yet tested it. We are wondering if PyServe could also be a good choice. We really like Triton server, but it has some limitations regarding the Python engine, and we are wondering whether it is a good choice for us, since we obviously won't leverage all the engines, etc.