diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml
index 852a4f3..72ffe41 100644
--- a/.github/workflows/nix.yml
+++ b/.github/workflows/nix.yml
@@ -11,6 +11,8 @@ jobs:
     steps:
     - uses: actions/checkout@v4
     - uses: DeterminateSystems/nix-installer-action@v10
+      with:
+        extra-conf: cores = 12
     - name: Authenticate to Google Cloud Platform
       uses: google-github-actions/auth@v2
       with:
@@ -37,6 +39,8 @@ jobs:
       run: |
         nix build --accept-flake-config ".#cog-triton-builder" -o cog-triton-builder
         ./cog-triton-builder push r8.im/replicate-internal/triton-builder
+        nix build --accept-flake-config ".#cog-triton-builder-h100" -o cog-triton-builder-h100
+        ./cog-triton-builder-h100 push r8.im/replicate-internal/triton-builder-h100
     - name: Build cog-triton-runner-80
       env:
         COG_TOKEN: ${{ secrets.COG_TOKEN }}
diff --git a/cog-trt-llm/trt_llm_builder.py b/cog-trt-llm/trt_llm_builder.py
index 1b49934..88ad6be 100644
--- a/cog-trt-llm/trt_llm_builder.py
+++ b/cog-trt-llm/trt_llm_builder.py
@@ -111,6 +111,9 @@ def _assemble_subprocess_cmd(self, executable, args, script=None):
         elif executable == "trtllm-build":
             cmd = [executable]
 
+        if "TRTLLM_PYTHON" in os.environ:
+            cmd[0] = os.path.join(os.environ["TRTLLM_PYTHON"], "bin", cmd[0])
+
         for k, v in args.items():
             cmd += ["--" + str(k)]
             cmd += [str(v)] if v else []
diff --git a/configs/example_official_model_config.yaml b/configs/example_official_model_config.yaml
index 89cb36a..21e13f5 100644
--- a/configs/example_official_model_config.yaml
+++ b/configs/example_official_model_config.yaml
@@ -28,12 +28,13 @@ instantiate:
       max_queue_delay_microseconds: 100
       max_attention_window_size: 4096
       kv_cache_free_gpu_mem_fraction: 0.95
+      max_queue_size: 0
 
   postprocessing:
     args:
       tokenizer_dir: /src/triton_model_repo/tensorrt_llm/1/
-      tokenizer_type: llama
+      tokenizer_type: auto
       triton_max_batch_size: 64
       postprocessing_instance_count: 64
diff --git a/default.nix b/default.nix
index c3a72ee..aae162c 100644
--- a/default.nix
+++ b/default.nix
@@ -2,50 +2,76 @@
 let
   deps = config.deps;
   python3 = config.python-env.deps.python;
-  cudaPackages = pkgs.cudaPackages_12_1;
+  inherit (config.cognix) cudaPackages;
   site = python3.sitePackages;
   pythonDrvs = config.python-env.pip.drvs;
   inherit (pkgs) lib;
   cfg = config.cog-triton; # defined in interface.nix
+  trtllm-env = config.python-env.public.extendModules {
+    modules = [{
+      _file = ./.;
+      pip.rootDependencies = lib.mkOverride 49 { tensorrt-llm = true; hf-transfer = true; };
+      pip.drvs.pydantic = let mkMoreForce = lib.mkOverride 49; in {
+        version = mkMoreForce "2.8.2";
+        mkDerivation.src = mkMoreForce (pkgs.fetchurl {
+          sha256 = "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8";
+          url = "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl";
+        });
+      };
+    }];
+  };
+  trtllm-pythonDrvs = trtllm-env.config.pip.drvs;
+  toCudaCapability = cmakeArch: {
+    "70-real" = "7.0";
+    "80-real" = "8.0";
+    "86-real" = "8.6";
+    "89-real" = "8.9";
+    "90-real" = "9.0";
+  }.${cmakeArch};
 in
 {
   imports = [ ./interface.nix ];
   cog.build = {
     python_version = "3.10";
-    cog_version = "0.10.0-alpha16";
+    cog_version = "0.10.0-alpha18";
     cuda = "12.1"; # todo: 12.2
     gpu = true;
-    # inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
+    # inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
     python_packages = [
       "--extra-index-url"
       "https://pypi.nvidia.com"
-      "tensorrt_llm==0.9.0"
-      "torch==2.2.2"
-      "tensorrt==9.3.0.post12.dev1"
-      "tensorrt-bindings==9.3.0.post12.dev1"
-      "tensorrt-libs==9.3.0.post12.dev1"
-      "nvidia-pytriton==0.5.2" # corresponds to 2.42.0
-      "httpx"
-      "nvidia-cublas-cu12<12.2"
-      "nvidia-cuda-nvrtc-cu12<12.2"
-      "nvidia-cuda-runtime-cu12<12.2"
+      "tensorrt_llm==0.12.0.dev2024073000"
+      "tensorrt-cu12==10.2.0.post1"
+      "torch==2.3.1"
+      "nvidia-pytriton==0.5.8" # corresponds to 2.46.0
       "omegaconf"
       "hf-transfer"
-      "tokenizers"
+      "tokenizers>=0.19.0"
    ];
    # don't ask why it needs ssh
    system_packages = [ "pget" "openssh" "openmpi" ];
  };
+  # patch in cuda packages from nixpkgs
+  cognix.merge-native = {
+    cudnn = "force";
+    cublas = true;
+  };
   python-env.pip = {
-    uv.enable = true;
-    # todo: add some constraints to match cudaPackages
     constraintsList = [
-      "nvidia-cudnn-cu12<9"
+      "datasets>2.15.0" # picks older fsspec but newer datasets
+      "mpi4py<4" # recent release with breaking changes
     ];
+    # HACK: cog requires pydantic <2, but we do need the extra deps pydantic2 brings in
     overridesList = [
-      "tokenizers==0.19.0"
-      "transformers==4.40.0"
+      "pydantic>=2.0"
     ];
+    drvs.pydantic = {
+      version = lib.mkForce "1.10.17";
+      mkDerivation.src = pkgs.fetchurl {
+        sha256 = "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e";
+        url = "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl";
+      };
+    };
   };
   cognix.includeNix = true;
   cognix.nix.extraOptions = ''
@@ -53,30 +79,45 @@ in
     extra-substituters = https://storage.googleapis.com/replicate-nix-cache-dev/
   '';
   python-env.pip.drvs = {
+
+    torch.public = lib.mkIf cfg.torchSourceBuild
+      (lib.mkForce config.deps.minimal-torch);
+    tensorrt-llm.public = lib.mkIf cfg.trtllmSourceBuild
+      (lib.mkForce config.deps.tensorrt-llm.override {
+        withPython = true;
+      });
+
+    nvidia-modelopt.mkDerivation.propagatedBuildInputs = [
+      pythonDrvs.setuptools.public
+    ];
     # tensorrt likes doing a pip invocation from it's setup.py
     # circumvent by manually depending on tensorrt_libs, tensorrt_bindings
     # and setting this env variable
-    tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
-    # TODO remove upon next rebuild:
-    tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [
-      tensorrt-libs.public
-      tensorrt-bindings.public
+    tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
+    tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ];
+    tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [
+      pythonDrvs.tensorrt-cu12-libs.public
     ];
-    tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ];
-    tensorrt-libs.mkDerivation.postFixup = ''
+    # fixes tensorrt-llm build
+    tensorrt-cu12-libs.mkDerivation.postFixup = ''
       pushd $out/${site}/tensorrt_libs
-      ln -s libnvinfer.so.9 libnvinfer.so
-      ln -s libnvonnxparser.so.9 libnvonnxparser.so
+      ln -s libnvinfer.so.10 libnvinfer.so
+      ln -s libnvonnxparser.so.10 libnvonnxparser.so
       popd
     '';
-    tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
+    tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
     tensorrt-llm = {
       mkDerivation.buildInputs = [ cudaPackages.nccl ];
       mkDerivation.propagatedBuildInputs = with pythonDrvs; [
-        tensorrt-libs.public # libnvinfer, onnxparse
+        tensorrt-cu12-libs.public # libnvinfer, onnxparse
       ];
       env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
-      env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"];
+      env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ];
+      mkDerivation.postInstall = ''
+        pushd $out/${site}/tensorrt_llm/bin
+        patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker
+        popd
+      '';
     };
     # has some binaries that want cudart
     tritonclient.mkDerivation.postInstall = "rm -r $out/bin";
@@ -96,23 +137,10 @@ in
       done
       popd
     '';
-    # patch in cuda packages from nixpkgs
-    nvidia-cublas-cu12.mkDerivation.postInstall = ''
-      pushd $out/${python3.sitePackages}/nvidia/cublas/lib
-      for f in ./*.so.12; do
-        chmod +w "$f"
-        rm $f
-        ln -s ${cudaPackages.libcublas.lib}/lib/$f ./$f
-      done
-      popd
-    '';
-    nvidia-cudnn-cu12.mkDerivation.postInstall = ''
-      pushd $out/${python3.sitePackages}/nvidia/cudnn/lib
-      for f in ./*.so.8; do
-        chmod +w "$f"
-        rm $f
-        ln -s ${cudaPackages.cudnn.lib}/lib/$f ./$f
-      done
+    mpi4py.mkDerivation.nativeBuildInputs = [ pkgs.removeReferencesTo ];
+    mpi4py.mkDerivation.postInstall = ''
+      pushd $out/${site}/mpi4py
+      remove-references-to -t ${pkgs.openmpi.dev} mpi.cfg MPI.*.so
       popd
     '';
   };
@@ -131,27 +159,46 @@ in
   deps.tensorrt-src = pkgs.fetchFromGitHub {
     owner = "NVIDIA";
     repo = "TensorRT";
-    rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch
-    hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE=";
+    rev = "v10.2.0";
+    hash = "sha256-Euo9VD4VTpx8XJV97IMETTAx/YkPGXiNdA39Wjp3UMU=";
   };
-  # todo: replace with lockfile
-  deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec {
-    pname = "pybind11-stubgen";
-    version = "2.5";
-    src = pkgs.fetchPypi {
-      inherit pname version;
-      hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ=";
+  # make a python3 environment with all the pkgs from lock.json *and* nixpkgs.python
+  # mainly used to build torch, which additionally requires astunparse
+  deps.python3-with-nixpkgs = python3.override {
+    packageOverrides = pyself: pysuper: (lib.mapAttrs (_: v: v.public.out) trtllm-pythonDrvs) // {
+      # todo: replace with lockfile?
+ pybind11-stubgen = pyself.buildPythonPackage rec { + pname = "pybind11-stubgen"; + version = "2.5"; + src = pyself.fetchPypi { + inherit pname version; + hash = "sha256-lqf+vKski/mKvUu3LMX3KbqHsjRCR0VMF1nmPN6f7zQ="; + }; + }; + # prevent infinite loop, don't override torch itself + inherit (pysuper) torch; }; }; deps.tensorrt-llm = pkgs.callPackage ./nix/tensorrt-llm.nix { - inherit python3 cudaPackages pythonDrvs; - # TODO: turn into config option + inherit python3 cudaPackages; + pythonDrvs = config.deps.trtllm-env.config.pip.drvs; withPython = false; inherit (cfg) architectures; - inherit (deps) pybind11-stubgen tensorrt-src; + inherit (deps.python3-with-nixpkgs.pkgs) pybind11-stubgen; + inherit (deps) tensorrt-src; }; + deps.trtllm-env = trtllm-env; deps.trtllm-backend = pkgs.callPackage ./nix/trtllm-backend.nix { inherit python3 cudaPackages pythonDrvs; inherit (deps) tensorrt-llm tensorrt-src; }; + deps.minimal-torch = pkgs.callPackage ./nix/torch.nix { + python3 = deps.python3-with-nixpkgs; + # todo: match/modify config.cognix.cudaPackages + cudaPackages = (pkgs.extend (self: super: { + config = super.config // { + cudaCapabilities = map toCudaCapability cfg.architectures; + }; + })).cudaPackages_12_1; + }; } diff --git a/flake.lock b/flake.lock index 644260d..21ce584 100644 --- a/flake.lock +++ b/flake.lock @@ -12,16 +12,16 @@ "rust-overlay": "rust-overlay" }, "locked": { - "lastModified": 1721227860, - "narHash": "sha256-Ufbkk0FaMViyFoRogCq+5iEWs8pCGR0Xkc+v7i83Uw0=", + "lastModified": 1721813236, + "narHash": "sha256-QeN5B6hTxLXUAa88xSLIr2D+MW4/ryszZWwbCj86/ek=", "owner": "datakami", "repo": "cognix", - "rev": "47db7ff3e82bc73bd067c89997bb8006da08a148", + "rev": "3a7c66a1e93b9badcaf4eb3e0f497b4597289c1e", "type": "github" }, "original": { "owner": "datakami", - "ref": "24.03", + "ref": "24.07", "repo": "cognix", "type": "github" } @@ -33,15 +33,15 @@ "pyproject-nix": "pyproject-nix" }, "locked": { - "lastModified": 1710167744, - "narHash": "sha256-z78iB1ckRQuJluM82iCuQNjN5hqsNpd1om0q75ncza4=", - "owner": "yorickvp", + "lastModified": 1719513340, + "narHash": "sha256-on3zRua52KZ8G5kBOXMQOzrsA07ywVMNdcIWJEeotfo=", + "owner": "nix-community", "repo": "dream2nix", - "rev": "3bfbbbb19471b60cf1bb7f7c476588a36ac3fb04", + "rev": "4d441820e0d0916c97d7af6c4d4f6843d676e242", "type": "github" }, "original": { - "owner": "yorickvp", + "owner": "nix-community", "repo": "dream2nix", "type": "github" } @@ -84,11 +84,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709780214, - "narHash": "sha256-p4iDKdveHMhfGAlpxmkCtfQO3WRzmlD11aIcThwPqhk=", + "lastModified": 1719436386, + "narHash": "sha256-NBGYaic5FLRg8AWSj6yr4g2IlMPUxNCVjRK6+RNuQBc=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "f945939fd679284d736112d3d5410eb867f3b31c", + "rev": "c66e984bda09e7230ea7b364e677c5ba4f0d36d0", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 6743fca..c62603a 100644 --- a/flake.nix +++ b/flake.nix @@ -4,7 +4,7 @@ extra-substituters = "https://storage.googleapis.com/replicate-nix-cache-dev/"; }; inputs = { - cognix.url = "github:datakami/cognix/24.03"; + cognix.url = "github:datakami/cognix/24.07"; }; outputs = { self, cognix }@inputs: (cognix.lib.cognixFlake inputs {}) // { @@ -26,23 +26,35 @@ cog-triton.architectures = architectures; # don't need this file in a runner - python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter '' + python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter '' rm 
$out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource* ''; }); - makeBuilder = name: callCognix ( { config, lib, ... }: { + makeBuilder = name: callCognix ( { config, lib, pkgs, ... }: { inherit name; # only grab deps of tensorrt-llm, omegaconf, hf-transfer - cognix.python_root_packages = [ "tensorrt-llm" "omegaconf" "hf-transfer" ]; + cognix.python_root_packages = [ "omegaconf" "hf-transfer" "transformers" "torch" ]; + + cog-triton.architectures = [ "80-real" "86-real" "90-real" ]; # override cog.yaml: cog.concurrency.max = lib.mkForce 1; cognix.rootPath = lib.mkForce "${./cog-trt-llm}"; # this just needs the examples/ dir cognix.environment.TRTLLM_DIR = config.deps.tensorrt-llm.examples; + # HACK: cog needs pydantic v1, but trt-llm needs pydantic v2 + cognix.environment.TRTLLM_PYTHON = config.deps.trtllm-env.config.public.pyEnv; }); in { cog-triton-builder = makeBuilder "cog-triton-builder"; + # we want to push the model to triton-builder-h100 as well + # as cog-triton-builder, but replicate doesn't let us. + # so let's add some data to fool it + cog-triton-builder-h100 = ((makeBuilder "cog-triton-builder-h100").extendModules { + modules = [{ + cognix.environment.TRTLLM_BUILDER_VARIANT = "h100"; + }]; + }).config.public; cog-triton-runner-80 = makeRunner "cog-triton-runner-80" ["80-real"] {}; cog-triton-runner-86 = makeRunner "cog-triton-runner-86" ["86-real"] {}; cog-triton-runner-90 = makeRunner "cog-triton-runner-90" ["90-real"] {}; diff --git a/interface.nix b/interface.nix index da2f8a5..01cc000 100644 --- a/interface.nix +++ b/interface.nix @@ -9,5 +9,7 @@ # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ # 80: A100, 86: A5000, A40, A800, 89: L40, 90: H100 }; + torchSourceBuild = mkEnableOption "Build Torch from source to be smaller"; + trtllmSourceBuild = mkEnableOption "Build trtllm python from source to be smaller"; }; } diff --git a/lock.json b/lock.json index 7207c4c..2c346a8 100644 --- a/lock.json +++ b/lock.json @@ -2,16 +2,22 @@ "fetchPipMetadata": { "sources": { "accelerate": { - "sha256": "c7bb817eb974bba0ff3ea1ba0f24d55afb86d50e3d4fe98d6922dc69cf2ccff1", + "sha256": "0a7f33d60ba09afabd028d4f0856dd19c5a734b7a596d637d9dd6e3d0eadbaf3", "type": "url", - "url": "https://files.pythonhosted.org/packages/f7/fc/c55e5a2da345c9a24aa2e1e0f60eb2ca290b6a41be82da03a6d4baec4f99/accelerate-0.25.0-py3-none-any.whl", - "version": "0.25.0" + "url": "https://files.pythonhosted.org/packages/15/33/b6b4ad5efa8b9f4275d4ed17ff8a44c97276171341ba565fdffb0e3dc5e8/accelerate-0.33.0-py3-none-any.whl", + "version": "0.33.0" + }, + "aiohappyeyeballs": { + "sha256": "4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03", + "type": "url", + "url": "https://files.pythonhosted.org/packages/8b/b4/0983e94060405eb51f23be493e3f5c28003f7ebc5efcd0803c1cb23ea407/aiohappyeyeballs-2.3.5-py3-none-any.whl", + "version": "2.3.5" }, "aiohttp": { - "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72", + "sha256": "b9db600a86414a9a653e3c1c7f6a2f6a1894ab8f83d11505247bd1b90ad57157", "type": "url", - "url": "https://files.pythonhosted.org/packages/a0/09/e7637f4f0760cad4d67347bbd8311c6ad0259a3fc01f04555af9e84bd378/aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "3.9.5" + "url": "https://files.pythonhosted.org/packages/79/ac/0319ee00dcc4ab36856d85a2185721f29806163212fb9e1745c836830aea/aiohttp-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "3.10.1" }, "aiosignal": 
{ "sha256": "f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17", @@ -19,6 +25,12 @@ "url": "https://files.pythonhosted.org/packages/76/ac/a7305707cb852b7e16ff80eaf5692309bde30e2b1100a1fcacdc8f731d97/aiosignal-1.3.1-py3-none-any.whl", "version": "1.3.1" }, + "annotated-types": { + "sha256": "1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", + "type": "url", + "url": "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", + "version": "0.7.0" + }, "antlr4-python3-runtime": { "sha256": "f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", "type": "url", @@ -73,11 +85,17 @@ "url": "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl", "version": "8.1.7" }, + "cloudpickle": { + "sha256": "246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7", + "type": "url", + "url": "https://files.pythonhosted.org/packages/96/43/dae06432d0c4b1dc9e9149ad37b4ca8384cf6eb7700cd9215b177b914f0a/cloudpickle-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "cog": { - "sha256": "0f658f2da28e37da8040d073af4f4e7a91b567a8d169f077d5afddc33793a62f", + "sha256": "abf55ed3309735b2a4fc37f51ac86ab113dcefd8eb4296c0edd5980e02efe463", "type": "url", - "url": "https://files.pythonhosted.org/packages/77/2e/440a1d358a45242b6cbabdfbd59e2f51c4106cbcc6b235b5930077929896/cog-0.10.0a16-py3-none-any.whl", - "version": "0.10.0a16" + "url": "https://files.pythonhosted.org/packages/68/8d/ca4854035294ea02a0a9ffcc11827f315953b5aa1754d68fd23b3753013a/cog-0.10.0a18-py3-none-any.whl", + "version": "0.10.0a18" }, "colored": { "sha256": "a7069673bd90a35f46cb748d012c17284a0668d2f1c06bc7a51822a2d5ad2112", @@ -92,28 +110,28 @@ "version": "15.0.1" }, "cuda-python": { - "sha256": "f087acc19ac4b467d71cfb7a39306038993176a7a1459426da50afa0fe68c697", + "sha256": "e177f584094d9c9fd9c7d153168486a3966765c79cb2a80e86feb15e3b5adc14", "type": "url", - "url": "https://files.pythonhosted.org/packages/70/d1/2e4ae2207f200b75ecfecf025517597ea00899759ef1cb5fb27e99641234/cuda_python-12.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "12.5.0" + "url": "https://files.pythonhosted.org/packages/86/93/f00a5f48eb67216d8a8818b93c0e8bbe5949f297add3367522081ec5223c/cuda_python-12.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "12.6.0" }, "datasets": { - "sha256": "29336bd316a7d827ccd4da2236596279b20ca2ac78f64c04c9483da7cbc2459b", + "sha256": "76ac02e3bdfff824492e20678f0b6b1b6d080515957fe834b00c2ba8d6b18e5e", "type": "url", - "url": "https://files.pythonhosted.org/packages/66/f8/38298237d18d4b6a8ee5dfe390e97bed5adb8e01ec6f9680c0ddf3066728/datasets-2.14.4-py3-none-any.whl", - "version": "2.14.4" + "url": "https://files.pythonhosted.org/packages/60/2d/963b266bb8f88492d5ab4232d74292af8beb5b6fdae97902df9e284d4c32/datasets-2.20.0-py3-none-any.whl", + "version": "2.20.0" }, "diffusers": { - "sha256": "ca258d8141a9faa85b3ce60805fc4898c91d0e73fd5b1576413dfe3b8502a8ec", + "sha256": "114194eb61498aff06243ade750fca6fbc179ca9df68923bb175b70030bed495", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/43/d4ae69ba5f503d58c7aef13f0f93d9c84694652dc2a16f8ea3d8246ebe95/diffusers-0.15.0-py3-none-any.whl", - "version": "0.15.0" + "url": "https://files.pythonhosted.org/packages/74/2b/69bb842f7567cd92a540f8a9a63a20e09304ad8ff84530f26762e7e19626/diffusers-0.30.0-py3-none-any.whl", + 
"version": "0.30.0" }, "dill": { - "sha256": "76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e", + "sha256": "c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", "type": "url", - "url": "https://files.pythonhosted.org/packages/f5/3a/74a29b11cf2cdfcd6ba89c0cecd70b37cd1ba7b77978ce611eb7a146a832/dill-0.3.7-py3-none-any.whl", - "version": "0.3.7" + "url": "https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", + "version": "0.3.8" }, "evaluate": { "sha256": "5fdcaf8a086b075c2b8e2c5898f501224b020b0ac7d07be76536e47e661c0c65", @@ -122,10 +140,10 @@ "version": "0.4.2" }, "exceptiongroup": { - "sha256": "5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad", + "sha256": "3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", "type": "url", - "url": "https://files.pythonhosted.org/packages/01/90/79fe92dd413a9cab314ef5c591b5aa9b9ba787ae4cadab75055b0ae00b33/exceptiongroup-1.2.1-py3-none-any.whl", - "version": "1.2.1" + "url": "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", + "version": "1.2.2" }, "fastapi": { "sha256": "f4165fb1fe3610c52cb1b8282c1480de9c34bc270f56a965aa93a884c350d605", @@ -139,12 +157,6 @@ "url": "https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", "version": "3.15.4" }, - "flatbuffers": { - "sha256": "8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812", - "type": "url", - "url": "https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl", - "version": "24.3.25" - }, "frozenlist": { "sha256": "a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a", "type": "url", @@ -152,10 +164,10 @@ "version": "1.4.1" }, "fsspec": { - "sha256": "3cb443f8bcd2efb31295a5b9fdb02aee81d8452c80d28f97a6d0959e6cee101e", + "sha256": "e0fdbc446d67e182f49a70b82cf7889028a63588fde6b222521f10937b2b670c", "type": "url", - "url": "https://files.pythonhosted.org/packages/5e/44/73bea497ac69bafde2ee4269292fa3b41f1198f4bb7bbaaabde30ad29d4a/fsspec-2024.6.1-py3-none-any.whl", - "version": "2024.6.1" + "url": "https://files.pythonhosted.org/packages/ba/a3/16e9fe32187e9c8bc7f9b7bcd9728529faa725231a0c96f2f98714ff2fc5/fsspec-2024.5.0-py3-none-any.whl", + "version": "2024.5.0" }, "gevent": { "sha256": "ca80b121bbec76d7794fcb45e65a7eca660a76cc1a104ed439cdbd7df5f0b060", @@ -176,10 +188,10 @@ "version": "3.0.3" }, "grpcio": { - "sha256": "e7cd5c1325f6808b8ae31657d281aadb2a51ac11ab081ae335f4f7fc44c1721d", + "sha256": "5764237d751d3031a36fafd57eb7d36fd2c10c658d2b4057c516ccf114849a3e", "type": "url", - "url": "https://files.pythonhosted.org/packages/5e/3b/459a477de3d899ffd4164d116a0a1db67468465ef5eaa81652f9319c27ab/grpcio-1.64.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.64.1" + "url": "https://files.pythonhosted.org/packages/a5/57/f03b02c4fad8b72539ab04b8b524782e071c89a2d9c182d60b5d9ded41d7/grpcio-1.65.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.65.4" }, "h11": { "sha256": "e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", @@ -200,10 +212,10 @@ "version": "3.10.0" }, "hf-transfer": { - "sha256": "2f42b89735f1cde22f2a795d1f0915741023235666be7de45879e533c7d6010c", + "sha256": 
"f865c33ada5bd3650c2b46e59979f2d7755c3f517f8d0facc78576a0c7d26406", "type": "url", - "url": "https://files.pythonhosted.org/packages/ce/00/a3afdb1fee4a9c28228f9962ab2ae3f3fc74380fff195022d76818e9fdac/hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.1.6" + "url": "https://files.pythonhosted.org/packages/5e/89/863f333b49603cc8d3c8862a428cc8fbaa9388ac8f076e9fa5ef3e729c3c/hf_transfer-0.1.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.1.8" }, "hpack": { "sha256": "84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c", @@ -230,10 +242,10 @@ "version": "0.27.0" }, "huggingface-hub": { - "sha256": "3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037", + "sha256": "d93fb63b1f1a919a22ce91a14518974e81fc4610bf344dfe7572343ce8d3aced", "type": "url", - "url": "https://files.pythonhosted.org/packages/69/d6/73f9d1b7c4da5f0544bc17680d0fa9932445423b90cd38e1ee77d001a4f5/huggingface_hub-0.23.4-py3-none-any.whl", - "version": "0.23.4" + "url": "https://files.pythonhosted.org/packages/0b/05/31b21998f68c31e7ffcc27ff08531fb9af5506d765ce8d661fb0036e6918/huggingface_hub-0.24.5-py3-none-any.whl", + "version": "0.24.5" }, "humanfriendly": { "sha256": "1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", @@ -254,10 +266,10 @@ "version": "3.7" }, "importlib-metadata": { - "sha256": "15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f", + "sha256": "11901fa0c2f97919b288679932bb64febaeacf289d18ac84dd68cb2e74213369", "type": "url", - "url": "https://files.pythonhosted.org/packages/dc/ef/38766b2edb096260d9b1b6ad35adaa0bce3b0567abb452b21eb074af88c4/importlib_metadata-8.0.0-py3-none-any.whl", - "version": "8.0.0" + "url": "https://files.pythonhosted.org/packages/82/47/bb25ec04985d0693da478797c3d8c1092b140f3a53ccb984fbbd38affa5b/importlib_metadata-8.2.0-py3-none-any.whl", + "version": "8.2.0" }, "janus": { "sha256": "2596ea5482711c1ee3ef2df6c290aaf370a13c55a007826e8f7c32d696d1d00a", @@ -277,12 +289,24 @@ "url": "https://files.pythonhosted.org/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl", "version": "1.1.9" }, + "markdown-it-py": { + "sha256": "355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", + "type": "url", + "url": "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "markupsafe": { "sha256": "2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", "type": "url", "url": "https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "2.1.5" }, + "mdurl": { + "sha256": "84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "type": "url", + "url": "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", + "version": "0.1.2" + }, "mpi4py": { "sha256": "c8fa625e0f92b082ef955bfb52f19fa6691d29273d7d71135d295aa143dee6cb", "type": "url", @@ -302,10 +326,10 @@ "version": "6.0.5" }, "multiprocess": { - "sha256": "7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a", + "sha256": "c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", "type": "url", - "url": "https://files.pythonhosted.org/packages/35/a8/36d8d7b3e46b377800d8dec47891cdf05842d1a2366909ae4a0c89fbc5e6/multiprocess-0.70.15-py310-none-any.whl", - "version": 
"0.70.15" + "url": "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", + "version": "0.70.16" }, "mypy-extensions": { "sha256": "4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d", @@ -331,12 +355,6 @@ "url": "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "1.26.4" }, - "nvidia-ammo": { - "sha256": "ed6b0aa3748e735923ce3825c0044a130400fcd040a2bb54580e4bcd7ef605d3", - "type": "url", - "url": "https://pypi.nvidia.com/nvidia-ammo/nvidia_ammo-0.7.4-cp310-cp310-linux_x86_64.whl", - "version": "0.7.4" - }, "nvidia-cublas-cu12": { "sha256": "ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728", "type": "url", @@ -362,10 +380,10 @@ "version": "12.1.105" }, "nvidia-cudnn-cu12": { - "sha256": "5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9", + "sha256": "adf4f59ed7a1341103822ed8df6e144f4d47ea8b10d9bf0ea0047ba738fd7b02", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", - "version": "8.9.2.26" + "url": "https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-8.9.7.29-py3-none-manylinux1_x86_64.whl", + "version": "8.9.7.29" }, "nvidia-cufft-cu12": { "sha256": "794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56", @@ -391,17 +409,23 @@ "url": "https://pypi.nvidia.com/nvidia-cusparse-cu12/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", "version": "12.1.0.106" }, + "nvidia-modelopt": { + "sha256": "0a81ab04b2013ebc2f35409a48b3eb774517294b7fc274d7bd33c39f4d8bf508", + "type": "url", + "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.15.1" + }, "nvidia-nccl-cu12": { - "sha256": "802756f02c43c0613dc83f48a76f702462b0f1f618411768748bba9c805fce19", + "sha256": "057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl", - "version": "2.19.3" + "url": "https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", + "version": "2.20.5" }, "nvidia-nvjitlink-cu12": { - "sha256": "f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212", + "sha256": "562ab97ea2c23164823b2a89cb328d01d45cb99634b8c65fe7cd60d14562bd79", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-nvjitlink-cu12/nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", - "version": "12.5.82" + "url": "https://pypi.nvidia.com/nvidia-nvjitlink-cu12/nvidia_nvjitlink_cu12-12.6.20-py3-none-manylinux2014_x86_64.whl", + "version": "12.6.20" }, "nvidia-nvtx-cu12": { "sha256": "dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5", @@ -410,10 +434,10 @@ "version": "12.1.105" }, "nvidia-pytriton": { - "sha256": "810531f752f7bdc4308b8821056ce2d5a456e6cb62966f2e07f65cff0053e42a", + "sha256": "cd3cdfb704db3a01f857adc97fea77d5413c9f9e89f9b7add91c9d16a0bec7f8", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.2-py3-none-manylinux_2_35_x86_64.whl", - "version": "0.5.2" + "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.8-py3-none-manylinux_2_35_x86_64.whl", + "version": "0.5.8" }, "omegaconf": { "sha256": 
"7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", @@ -422,28 +446,16 @@ "version": "2.3.0" }, "onnx": { - "sha256": "6251910e554f811fdd070164b0bc76d76b067b95576cb9dad4d52ae64fe014b5", + "sha256": "ec6a425e59291fff430da4a884aa07a1d0cbb5dcd22cc78f6cf4ba5adb9f3367", "type": "url", - "url": "https://files.pythonhosted.org/packages/c6/7e/5031717c0636e6074764a2f61a459a3ecd46c20d8b83a1f1cd2513a76160/onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.16.1" - }, - "onnx-graphsurgeon": { - "sha256": "10c130d6129fdeee02945f8103b5b112e6fd4d9b356e2dd3e80f53e0ebee7b5c", - "type": "url", - "url": "https://pypi.nvidia.com/onnx-graphsurgeon/onnx_graphsurgeon-0.5.2-py2.py3-none-any.whl", - "version": "0.5.2" - }, - "onnxruntime": { - "sha256": "ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889", - "type": "url", - "url": "https://files.pythonhosted.org/packages/7a/cf/6aa8c56fd63f53c2c485921e411269c7b501a2b4e634bd02f226ab2d5d8e/onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.16.3" + "url": "https://files.pythonhosted.org/packages/f5/3d/d28484e5d87d4500db0d3b44836d9cd31d88f1efbe168356dbb1dd4f2571/onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.16.2" }, "optimum": { - "sha256": "1354dd1081179b7c490d135c7f380cee672125e17c0bfef143e616c5b756b1db", + "sha256": "508bc55db3c9434f4e8d5a30c39a46ac63c4cdb45bcc5a641b6c1c77cae88d23", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/6d/6b03ffb8df1ab2b43d461f7cace2af5f20092f0767f53a3e9331df00e8a2/optimum-1.21.1-py3-none-any.whl", - "version": "1.21.1" + "url": "https://files.pythonhosted.org/packages/5d/7a/1cc655edf289cdb533b0ea1d2f382d344248a53ad21eb8e34deb4551684b/optimum-1.17.1-py3-none-any.whl", + "version": "1.17.1" }, "packaging": { "sha256": "5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", @@ -458,10 +470,10 @@ "version": "2.2.2" }, "pillow": { - "sha256": "a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc", + "sha256": "b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf", "type": "url", - "url": "https://files.pythonhosted.org/packages/b5/5b/6651c288b08df3b8c1e2f8c1152201e0b25d240e22ddade0f1e242fc9fa0/pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "10.4.0" + "url": "https://files.pythonhosted.org/packages/b5/a2/7a09695dc636bf8d0a1b63022f58701177b7dc6fad30f6d6bc343e5473a4/pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "10.3.0" }, "polygraphy": { "sha256": "62ae22825efdd3288222e5b1d2d791fe58e87844fcd848bcd1251fbce02ba956", @@ -470,10 +482,10 @@ "version": "0.49.9" }, "protobuf": { - "sha256": "7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d", + "sha256": "3319e073562e2515c6ddc643eb92ce20809f5d8f10fead3332f71c63be6a7040", "type": "url", - "url": "https://files.pythonhosted.org/packages/15/db/7f731524fe0e56c6b2eb57d05b55d3badd80ef7d1f1ed59db191b2fdd8ab/protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", - "version": "4.25.3" + "url": "https://files.pythonhosted.org/packages/ca/6c/cc7ab2fb3a4a7f07f211d8a7bbb76bba633eb09b148296dbd4281e217f95/protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl", + "version": "4.25.4" }, "psutil": { "sha256": "5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd", @@ -482,28 +494,46 @@ "version": "6.0.0" }, "pulp": { - "sha256": "4a19814a5b0a4392d788ac2315263435293579b0583c3469943fe0c6a586f263", + "sha256": 
"ad6a9b566d8458f4d05f4bfe2cea59e32885dd1da6929a361be579222107987c", "type": "url", - "url": "https://files.pythonhosted.org/packages/09/d7/57e71e11108203039c895643368c0d1a99fe719a6a80184edf240c33d25f/PuLP-2.8.0-py3-none-any.whl", - "version": "2.8.0" + "url": "https://files.pythonhosted.org/packages/64/10/704c18b5960b3f9b10efcc859e11881ad90f1e44008e181d2b10cd305a63/PuLP-2.9.0-py3-none-any.whl", + "version": "2.9.0" }, "pyarrow": { - "sha256": "48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd", + "sha256": "f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", + "type": "url", + "url": "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "17.0.0" + }, + "pyarrow-hotfix": { + "sha256": "dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178", "type": "url", - "url": "https://files.pythonhosted.org/packages/b0/54/eb7fcfc0e1ec6a8404cadd11ac957b3ee4fd0774225cafe3ffe6287861cb/pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", - "version": "16.1.0" + "url": "https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl", + "version": "0.6" }, "pydantic": { - "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e", + "sha256": "73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8", + "type": "url", + "url": "https://files.pythonhosted.org/packages/1f/fa/b7f815b8c9ad021c07f88875b601222ef5e70619391ade4a49234d12d278/pydantic-2.8.2-py3-none-any.whl", + "version": "2.8.2" + }, + "pydantic-core": { + "sha256": "3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98", "type": "url", - "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.10.17" + "url": "https://files.pythonhosted.org/packages/ae/49/8a6fe79d35e2f3bea566d8ea0e4e6f436d4f749d7838c8e8c4c5148ae706/pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2.20.1" + }, + "pygments": { + "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", + "type": "url", + "url": "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", + "version": "2.18.0" }, "pynvml": { - "sha256": "5cce014ac01b098d08f06178f86c37be409b80b2e903a5a03ce15eed60f55e25", + "sha256": "a5fba3ab14febda50d19dbda012ef62ae0aed45b7ccc07af0bc5be79223e450c", "type": "url", - "url": "https://files.pythonhosted.org/packages/5b/9c/adb8070059caaa15d5a572b66bccd95900d8c1b9fa54d6ecea6ae97448d1/pynvml-11.5.0-py3-none-any.whl", - "version": "11.5.0" + "url": "https://files.pythonhosted.org/packages/54/5b/16e50abf152be7f18120f11dfff495014a9eaff7b764626e1656f04ad262/pynvml-11.5.3-py3-none-any.whl", + "version": "11.5.3" }, "pyproject-hooks": { "sha256": "7ceeefe9aec63a1064c18d939bdc3adf2d8aa1988a510afec15151578b232aa2", @@ -524,10 +554,10 @@ "version": "1.0.1" }, "python-rapidjson": { - "sha256": "507595740300e95dded254536558cd56733cc3207e3c2457f19231ad00e78d85", + "sha256": "6cb3ad353ec083a6dcf0552f1fce3c490f92e2fccf9a81eac42835297a8431a1", "type": "url", - "url": 
"https://files.pythonhosted.org/packages/75/f7/7d79a906618ac106c6fad6704bc6375056308526df834fa867b7d94d6039/python_rapidjson-1.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.18" + "url": "https://files.pythonhosted.org/packages/f7/e4/b2d1dff12eae71c35e59d1379727697fd7a543d1ac027071f3cd486b8a1f/python_rapidjson-1.20-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.20" }, "pytz": { "sha256": "328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319", @@ -536,22 +566,22 @@ "version": "2024.1" }, "pyyaml": { - "sha256": "ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515", + "sha256": "ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", "type": "url", - "url": "https://files.pythonhosted.org/packages/29/61/bf33c6c85c55bc45a29eee3195848ff2d518d84735eb0e2d8cb42e0d285e/PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "6.0.1" + "url": "https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "6.0.2" }, "pyzmq": { - "sha256": "7e0113d70b095339e99bb522fe7294f5ae6a7f3b2b8f52f659469a74b5cc7661", + "sha256": "77ce6a332c7e362cb59b63f5edf730e83590d0ab4e59c2aa5bd79419a42e3449", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/ac/18b75626cede66295a27e94d7cfe301d2d35120b200a6a46f205a171a20e/pyzmq-23.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "23.2.1" + "url": "https://files.pythonhosted.org/packages/4a/f2/633999c1dcc7e7c0536ac990390a6a3e49295724dbf450c42ea730daadd9/pyzmq-26.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "26.1.0" }, "regex": { - "sha256": "1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5", + "sha256": "bf7a89eef64b5455835f5ed30254ec19bf41f7541cd94f266ab7cbd463f00c41", "type": "url", - "url": "https://files.pythonhosted.org/packages/07/17/5d92509b4dccacf9767d8607112c19667e15db2428014440bae4356b8aff/regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "2024.5.15" + "url": "https://files.pythonhosted.org/packages/3e/66/04b63f31580026c8b819aed7f171149177d10cfab27477ea8800a2268d50/regex-2024.7.24-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2024.7.24" }, "requests": { "sha256": "70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", @@ -559,11 +589,17 @@ "url": "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", "version": "2.32.3" }, + "rich": { + "sha256": "4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222", + "type": "url", + "url": "https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl", + "version": "13.7.1" + }, "safetensors": { - "sha256": "d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376", + "sha256": "44d464bdc384874601a177375028012a5f177f1505279f9456fea84bbc575c7f", "type": "url", - "url": "https://files.pythonhosted.org/packages/8f/05/969e1a976b84283285181b00028cf73d78434b77a6627fc2a94194cca265/safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.4.3" + "url": 
"https://files.pythonhosted.org/packages/18/f3/27bf4d7112b194eea2d8401706953080692d37ace1b74b36fcc7234961cd/safetensors-0.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.4.4" }, "scipy": { "sha256": "42470ea0195336df319741e230626b6225a740fd9dce9642ca13e98f667047c0", @@ -578,16 +614,22 @@ "version": "0.2.0" }, "setuptools": { - "sha256": "b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05", + "sha256": "5a03e1860cf56bb6ef48ce186b0e557fdba433237481a9a625176c2831be15d1", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/54/2a8ecfcc9a714a6fbf86559a4b0f50b126a4ac4269ea8134f2c75c3e73de/setuptools-70.2.0-py3-none-any.whl", - "version": "70.2.0" + "url": "https://files.pythonhosted.org/packages/e1/58/e0ef3b9974a04ce9cde2a7a33881ddcb2d68450803745804545cdd8d258f/setuptools-72.1.0-py3-none-any.whl", + "version": "72.1.0" }, "sh": { - "sha256": "e4045b6c732d9ce75d571c79f5ac2234edd9ae4f5fa9d59b09705082bdca18c7", + "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/09/89c28aaf2a49f226fef8587c90c6386bd2cc03a0295bc4ff7fc6ee43c01d/sh-1.14.3.tar.gz", - "version": "1.14.3" + "url": "https://files.pythonhosted.org/packages/15/c2/79f9dea6fc544c0eb79ed5018a38860c52d597c4be66c2cf2029bea5b3fd/sh-2.0.7-py3-none-any.whl", + "version": "2.0.7" + }, + "shellingham": { + "sha256": "7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", + "type": "url", + "url": "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", + "version": "1.5.4" }, "six": { "sha256": "8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", @@ -614,46 +656,52 @@ "version": "0.4.15" }, "structlog": { - "sha256": "983bd49f70725c5e1e3867096c0c09665918936b3db27341b41d294283d7a48a", + "sha256": "597f61e80a91cc0749a9fd2a098ed76715a1c8a01f73e336b746504d1aad7610", "type": "url", - "url": "https://files.pythonhosted.org/packages/8f/63/2eb7d30fe126dbd8a398386f14ab0421bb722515f9f50c35fd4048251285/structlog-24.2.0-py3-none-any.whl", - "version": "24.2.0" + "url": "https://files.pythonhosted.org/packages/bf/65/813fc133609ebcb1299be6a42e5aea99d6344afb35ccb43f67e7daaa3b92/structlog-24.4.0-py3-none-any.whl", + "version": "24.4.0" }, "sympy": { - "sha256": "9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515", + "sha256": "db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", "type": "url", - "url": "https://files.pythonhosted.org/packages/61/53/e18c8c97d0b2724d85c9830477e3ebea3acf1dcdc6deb344d5d9c93a9946/sympy-1.12.1-py3-none-any.whl", - "version": "1.12.1" + "url": "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", + "version": "1.13.1" }, "tensorrt": { - "sha256": "24aea5376cb8440afe2b0a22ee83f9748e586aa27303d4f80091ad48a56552a4", + "sha256": "ac5336eea871b812047f6ec8bbeed9ae343a539ba02ff86513b04bd081c14738", + "type": "url", + "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.2.0.post1.tar.gz", + "version": "10.2.0.post1" + }, + "tensorrt-cu12": { + "sha256": "9663446e2872113d619ad5010766cccc1f023d693cb43c3f8f2496563028badc", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt/tensorrt-9.3.0.post12.dev1.tar.gz", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.2.0.post1.tar.gz", + "version": "10.2.0.post1" }, - 
"tensorrt-bindings": { - "sha256": "c1619e4a9b23b077717af7635489cd1a12a8b4d97477088fc3c5d3a81e36bf65", + "tensorrt-cu12-bindings": { + "sha256": "3248e7951d1f2fa8884759b19456ab7d08a3f75bd6b8e5d58e5cc18788c02171", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-bindings/tensorrt_bindings-9.3.0.post12.dev1-cp310-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.2.0.post1-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, - "tensorrt-libs": { - "sha256": "ab0b6ee6cd41503273d44892cb92b92c75d046a5e468b73884978f59cca4b8d9", + "tensorrt-cu12-libs": { + "sha256": "a42f7ecb1659fac27cf68996df0984e68018be61bd8bbd95f51619f9c4e9cf31", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-libs/tensorrt_libs-9.3.0.post12.dev1-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.2.0.post1-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.2.0.post1" }, "tensorrt-llm": { - "sha256": "2f60b6f8d0afee5f52a5160a44815b0af3e9cd4c46b53cc7a252377ed6cec670", + "sha256": "2a13e1b42a8e5f30189d9e55d8e7e8abe90db1d395130ce328ab50748037053e", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.9.0-cp310-cp310-linux_x86_64.whl", - "version": "0.9.0" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.12.0.dev2024073000-cp310-cp310-linux_x86_64.whl", + "version": "0.12.0.dev2024073000" }, "tokenizers": { - "sha256": "06a56acdfe6c5d51c03ebfc6838f727fcf231c035b94f2460cca68947f6799dc", + "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", "type": "url", - "url": "https://files.pythonhosted.org/packages/11/f9/8c77a471469ea7d1b52f2a25607385109c954d6444a9b0df19796beba461/tokenizers-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.19.0" + "url": "https://files.pythonhosted.org/packages/40/4f/eb78de4af3b17b589f43a369cbf0c3a7173f25c3d2cd93068852c07689aa/tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.19.1" }, "tomli": { "sha256": "939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", @@ -662,34 +710,40 @@ "version": "2.0.1" }, "torch": { - "sha256": "cade4fd6c8ce7d826dbcfabd65f1d53b0ee0a058db8c1809d65bfd6051b55530", + "sha256": "f0deb5d2f932a68ed54625ba140eddbf2af22be978ee19b9b63c986add6425b2", "type": "url", - "url": "https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp310-cp310-linux_x86_64.whl", - "version": "2.2.2+cu121" + "url": "https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp310-cp310-linux_x86_64.whl", + "version": "2.3.1+cu121" }, "tqdm": { - "sha256": "b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644", + "sha256": "90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd", "type": "url", - "url": "https://files.pythonhosted.org/packages/18/eb/fdb7eb9e48b7b02554e1664afd3bd3f117f6b6d6c5881438a0b055554f9b/tqdm-4.66.4-py3-none-any.whl", - "version": "4.66.4" + "url": "https://files.pythonhosted.org/packages/48/5d/acf5905c36149bbaec41ccf7f2b68814647347b72075ac0b1fe3022fdc73/tqdm-4.66.5-py3-none-any.whl", + "version": "4.66.5" }, "transformers": { - "sha256": "92797ec3368ed4476a053529a4039a12ad09167d9e371981dda4afb4bdf590ac", + "sha256": "ea0ff72def71e9f4812d9414d4803b22681b1617aa6f511bd51cfff2b44a6fca", "type": "url", - "url": 
"https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl", - "version": "4.40.0" + "url": "https://files.pythonhosted.org/packages/62/c0/810e741a6244c0f004be40ccb96486d072f042eabbd4d7e8aa02b81ca1eb/transformers-4.44.0-py3-none-any.whl", + "version": "4.44.0" }, "triton": { - "sha256": "a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", + "sha256": "3c84595cbe5e546b1b290d2a58b1494df5a2ef066dd890655e5b8a8a92205c33", "type": "url", - "url": "https://download.pytorch.org/whl/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "2.2.0" + "url": "https://download.pytorch.org/whl/triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "2.3.1" }, "tritonclient": { - "sha256": "754ab373a45306be0c45afbcde06838179d04561694f6d15e138530153aee581", + "sha256": "7074885798e8a711fedaf94d6dd77f310973fe2ebf29fb2837db350a43e2d6a5", "type": "url", - "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.47.0-py3-none-manylinux1_x86_64.whl", - "version": "2.47.0" + "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.48.0-py3-none-manylinux1_x86_64.whl", + "version": "2.48.0" + }, + "typer": { + "sha256": "070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914", + "type": "url", + "url": "https://files.pythonhosted.org/packages/20/b5/11cf2e34fbb11b937e006286ab5b8cfd334fde1c8fa4dd7f491226931180/typer-0.12.3-py3-none-any.whl", + "version": "0.12.3" }, "typing-extensions": { "sha256": "04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", @@ -698,10 +752,10 @@ "version": "4.12.2" }, "typing-inspect": { - "sha256": "3b98390df4d999a28cf5b35d8b333425af5da2ece8a4ea9e98f71e7591347b4f", + "sha256": "9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/1c/66402db44184904a2f14722d317a4da0b5c8c78acfc3faf74362566635c5/typing_inspect-0.6.0-py3-none-any.whl", - "version": "0.6.0" + "url": "https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", + "version": "0.9.0" }, "tzdata": { "sha256": "9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", @@ -716,10 +770,10 @@ "version": "2.2.2" }, "uvicorn": { - "sha256": "cd17daa7f3b9d7a24de3617820e634d0933b69eed8e33a516071174427238c81", + "sha256": "b2d86de274726e9878188fa07576c9ceeff90a839e2b6e25c917fe05f5a6c835", "type": "url", - "url": "https://files.pythonhosted.org/packages/b2/f9/e6f30ba6094733e4f9794fd098ca0543a19b07ac1fa3075d595bf0f1fb60/uvicorn-0.30.1-py3-none-any.whl", - "version": "0.30.1" + "url": "https://files.pythonhosted.org/packages/67/d8/1bcb5e6508d14c6c9912cd964b286f04392298ffb3e4218f4a1292d64e76/uvicorn-0.30.5-py3-none-any.whl", + "version": "0.30.5" }, "uvloop": { "sha256": "5a05128d315e2912791de6088c34136bfcdd0c7cbc1cf85fd6fd1bb321b7c849", @@ -728,10 +782,10 @@ "version": "0.19.0" }, "watchfiles": { - "sha256": "c2444dc7cb9d8cc5ab88ebe792a8d75709d96eeef47f4c8fccb6df7c7bc5be71", + "sha256": "8f48c917ffd36ff9a5212614c2d0d585fa8b064ca7e66206fb5c095015bc8207", "type": "url", - "url": "https://files.pythonhosted.org/packages/3d/ae/e7eddbdca559f14a9a38cf04782a5d715cf350aad498d0862fb02b4ebe10/watchfiles-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.22.0" + "url": 
"https://files.pythonhosted.org/packages/22/ec/c756c012b174ccf5f2ee32202603e66b33b93a54cf16c69a7440c764d7f9/watchfiles-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.23.0" }, "websockets": { "sha256": "6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480", @@ -740,10 +794,10 @@ "version": "12.0" }, "wheel": { - "sha256": "55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81", + "sha256": "2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f", "type": "url", - "url": "https://files.pythonhosted.org/packages/7d/cd/d7460c9a869b16c3dd4e1e403cce337df165368c71d6af229a74699622ce/wheel-0.43.0-py3-none-any.whl", - "version": "0.43.0" + "url": "https://files.pythonhosted.org/packages/1b/d1/9babe2ccaecff775992753d8686970b1e2755d21c8a63be73aba7a4e7d77/wheel-0.44.0-py3-none-any.whl", + "version": "0.44.0" }, "wrapt": { "sha256": "ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf", @@ -776,10 +830,10 @@ "version": "5.0" }, "zope-interface": { - "sha256": "d22fce0b0f5715cdac082e35a9e735a1752dc8585f005d045abb1a7c20e197f9", + "sha256": "10ebac566dd0cec66f942dc759d46a994a2b3ba7179420f0e2130f88f8a5f400", "type": "url", - "url": "https://files.pythonhosted.org/packages/64/0a/849dc6346aae1929101174b413517b1105e278bd649c856584944b834208/zope.interface-6.4.post2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "6.4.post2" + "url": "https://files.pythonhosted.org/packages/ef/c2/8c38d60a99ff20c4837866362283e5e7e7b63fd2ab62eee35d0055dab7c3/zope.interface-7.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "7.0.1" } }, "targets": { @@ -791,10 +845,11 @@ "psutil", "pyyaml", "safetensors", - "torch", - "transformers" + "torch" ], + "aiohappyeyeballs": [], "aiohttp": [ + "aiohappyeyeballs", "aiosignal", "async-timeout", "attrs", @@ -805,6 +860,7 @@ "aiosignal": [ "frozenlist" ], + "annotated-types": [], "antlr4-python3-runtime": [], "anyio": [ "exceptiongroup", @@ -823,6 +879,7 @@ "certifi": [], "charset-normalizer": [], "click": [], + "cloudpickle": [], "cog": [ "attrs", "fastapi", @@ -842,6 +899,7 @@ "datasets": [ "aiohttp", "dill", + "filelock", "fsspec", "huggingface-hub", "multiprocess", @@ -849,10 +907,10 @@ "packaging", "pandas", "pyarrow", + "pyarrow-hotfix", "pyyaml", "requests", "tqdm", - "transformers", "xxhash" ], "diffusers": [ @@ -863,7 +921,7 @@ "pillow", "regex", "requests", - "transformers" + "safetensors" ], "dill": [], "evaluate": [ @@ -877,7 +935,6 @@ "pandas", "requests", "tqdm", - "transformers", "xxhash" ], "exceptiongroup": [], @@ -886,7 +943,6 @@ "starlette" ], "filelock": [], - "flatbuffers": [], "frozenlist": [], "fsspec": [ "aiohttp" @@ -949,7 +1005,11 @@ "markupsafe" ], "lark": [], + "markdown-it-py": [ + "mdurl" + ], "markupsafe": [], + "mdurl": [], "mpi4py": [], "mpmath": [], "multidict": [], @@ -960,24 +1020,13 @@ "networkx": [], "ninja": [], "numpy": [], - "nvidia-ammo": [ - "networkx", - "ninja", - "numpy", - "onnx", - "onnx-graphsurgeon", - "onnxruntime", - "scipy", - "torch", - "tqdm", - "transformers" - ], "nvidia-cublas-cu12": [], "nvidia-cuda-cupti-cu12": [], "nvidia-cuda-nvrtc-cu12": [], "nvidia-cuda-runtime-cu12": [], "nvidia-cudnn-cu12": [ - "nvidia-cublas-cu12" + "nvidia-cublas-cu12", + "nvidia-cuda-nvrtc-cu12" ], "nvidia-cufft-cu12": [], "nvidia-curand-cu12": [], @@ -989,15 +1038,28 @@ "nvidia-cusparse-cu12": [ "nvidia-nvjitlink-cu12" ], + "nvidia-modelopt": [ 
+ "cloudpickle", + "ninja", + "numpy", + "packaging", + "pydantic", + "rich", + "scipy", + "tqdm" + ], "nvidia-nccl-cu12": [], "nvidia-nvjitlink-cu12": [], "nvidia-nvtx-cu12": [], "nvidia-pytriton": [ + "grpcio", + "importlib-metadata", "numpy", "protobuf", "pyzmq", "sh", "tritonclient", + "typer", "typing-inspect", "wrapt" ], @@ -1009,18 +1071,6 @@ "numpy", "protobuf" ], - "onnx-graphsurgeon": [ - "numpy", - "onnx" - ], - "onnxruntime": [ - "coloredlogs", - "flatbuffers", - "numpy", - "packaging", - "protobuf", - "sympy" - ], "optimum": [ "coloredlogs", "datasets", @@ -1046,9 +1096,16 @@ "pyarrow": [ "numpy" ], + "pyarrow-hotfix": [], "pydantic": [ + "annotated-types", + "pydantic-core", "typing-extensions" ], + "pydantic-core": [ + "typing-extensions" + ], + "pygments": [], "pynvml": [], "pyproject-hooks": [], "python-dateutil": [ @@ -1066,6 +1123,10 @@ "idna", "urllib3" ], + "rich": [ + "markdown-it-py", + "pygments" + ], "safetensors": [], "scipy": [ "numpy" @@ -1073,6 +1134,7 @@ "sentencepiece": [], "setuptools": [], "sh": [], + "shellingham": [], "six": [], "sniffio": [], "starlette": [ @@ -1084,14 +1146,15 @@ "mpmath" ], "tensorrt": [ - "tensorrt-bindings", - "tensorrt-libs" + "tensorrt-cu12" ], - "tensorrt-bindings": [], - "tensorrt-libs": [ - "nvidia-cublas-cu12", - "nvidia-cuda-runtime-cu12", - "nvidia-cudnn-cu12" + "tensorrt-cu12": [ + "tensorrt-cu12-bindings", + "tensorrt-cu12-libs" + ], + "tensorrt-cu12-bindings": [], + "tensorrt-cu12-libs": [ + "nvidia-cuda-runtime-cu12" ], "tensorrt-llm": [ "accelerate", @@ -1106,17 +1169,17 @@ "mpi4py", "mpmath", "numpy", - "nvidia-ammo", - "nvidia-cudnn-cu12", + "nvidia-modelopt", "onnx", "optimum", "pandas", + "pillow", "polygraphy", "psutil", "pulp", + "pydantic", "pynvml", "sentencepiece", - "setuptools", "strenum", "tensorrt", "torch", @@ -1153,10 +1216,13 @@ "huggingface-hub", "numpy", "packaging", + "protobuf", + "pydantic", "pyyaml", "regex", "requests", "safetensors", + "sentencepiece", "tokenizers", "tqdm" ], @@ -1165,7 +1231,6 @@ ], "tritonclient": [ "aiohttp", - "cuda-python", "geventhttpclient", "grpcio", "numpy", @@ -1174,6 +1239,12 @@ "python-rapidjson", "urllib3" ], + "typer": [ + "click", + "rich", + "shellingham", + "typing-extensions" + ], "typing-extensions": [], "typing-inspect": [ "mypy-extensions", @@ -1214,5 +1285,5 @@ } } }, - "invalidationHash": "aea5c24536de46921b0505e9f29e379558d83bbd76f08cf2f49f8ffe84243032" + "invalidationHash": "e7e598f1bfb380172adc5cd02e60cdb3bf715610c02bf2b9dfe84e8a53754f0f" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index 20f901d..9764c64 100644 --- a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -14,17 +14,20 @@ pybind11-stubgen ? null, withPython ? 
true, rsync, + zstd, + autoPatchelfHook, + patchelfUnstable, }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; - version = "0.9.0"; + version = "0.12.0.dev2024073000"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; - rev = "v${o.version}"; + rev = "a681853d3803ee5893307e812530b5e7004bb6e1"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-BGU56yI6yuTGHYhq5I3xYhrsKI8O4ykhDFeRP/JGCRo="; + hash = "sha256-Uvx8+Lhuo8lT4TqKjYSL0Mt/QI8jS5T9kxdsNGKJZzU="; }; outputs = if withPython then @@ -37,10 +40,14 @@ stdenv.mkDerivation (o: { [ "out" ]; setSourceRoot = "sourceRoot=$(echo */cpp)"; nativeBuildInputs = [ + patchelfUnstable + zstd cmake ninja python3 cudaPackages.cuda_nvcc + rsync + autoPatchelfHook ]; buildInputs = [ @@ -50,52 +57,63 @@ stdenv.mkDerivation (o: { openmpi python3.pkgs.setuptools ] - ++ (lib.optionals (!withPython) [ - # torch hates the split cuda, so only do it without torch - cudaPackages.cuda_cudart - cudaPackages.cuda_nvcc.dev - cudaPackages.cuda_cccl - cudaPackages.libcublas.lib - cudaPackages.libcublas.dev - cudaPackages.libcurand.dev - cudaPackages.cuda_profiler_api + ++ (with cudaPackages; [ + cuda_cudart + cuda_nvcc.dev + cuda_nvrtc.dev + cuda_nvrtc.lib + cuda_nvml_dev.lib + cuda_nvml_dev.dev + cuda_cccl + libcublas.lib + libcublas.dev + libcurand.dev + cuda_profiler_api ]) - ++ (lib.optionals withPython [ - cudaPackages.cudatoolkit + ++ (lib.optionals withPython (with cudaPackages; [ + cuda_nvtx.dev cuda_nvtx.lib + libcusparse.dev libcusparse.lib + libcusolver.dev libcusolver.lib python3.pkgs.pybind11 python3.pkgs.wheel python3.pkgs.pip pybind11-stubgen - ]); + ])); + env.pythonRelaxDeps = "nvidia-cudnn-cu12"; propagatedBuildInputs = lib.optionals withPython ( with pythonDrvs; builtins.map (x: x.public or x) [ - accelerate # ==0.25.0 + accelerate build colored - # concerning statement from trtllm's requirements.txt: - cuda-python # "Do not override the custom version of cuda-python installed in the NGC PyTorch image." - diffusers # ==0.15.0 + cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. 
+ diffusers lark mpi4py numpy - onnx # >=1.12.0 + onnx polygraphy psutil - pynvml # >=11.5.0 - sentencepiece # >=0.1.99 - tensorrt # ==9.2.0.post12.dev5 - tensorrt-bindings # missed transitive dep - tensorrt-libs - torch # <=2.2.0a - nvidia-ammo # ~=0.7.0; platform_machine=="x86_64" - transformers # ==4.36.1 + pynvml + pulp + pandas + h5py + strenum + sentencepiece + tensorrt + torch + nvidia-modelopt + transformers + pillow wheel optimum evaluate janus + mpmath ] ); + autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ]; + # tries to run cutlass's `python setup.py develop` PYTHONUSERBASE = "/tmp/python"; preConfigure = '' @@ -109,58 +127,90 @@ stdenv.mkDerivation (o: { "-DBUILD_PYBIND=${if withPython then "ON" else "OFF"}" # needs BUILD_PYT "-DBUILD_TESTS=OFF" # needs nvonnxparser.h # believe it or not, this is the actual binary distribution channel for tensorrt: - "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-libs.public}/${python3.sitePackages}/tensorrt_libs" + "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-cu12-libs.public}/${python3.sitePackages}/tensorrt_libs" "-DTRT_INCLUDE_DIR=${tensorrt-src}/include" "-DCMAKE_CUDA_ARCHITECTURES=${builtins.concatStringsSep ";" architectures}" # "-DFAST_BUILD=ON" + "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - postBuild = lib.optionalString withPython '' + # workaround: cuda_nvcc exposes a gcc12 that uses a gcc13 libc + # however, cmake finds the gcc12 libc somehow, which is wrong + postConfigure = '' + sed -i 's#${cudaPackages.cuda_nvcc.stdenv.cc.cc.lib}#${stdenv.cc.cc.lib}#g' build.ninja + ''; + # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once + postPatch = '' + sed -i 's/#include /#include \n#include /' include/tensorrt_llm/common/mpiUtils.h + sed -i 's/#pragma once/#pragma once\n#include /' tensorrt_llm/kernels/lruKernel.h + ''; + # configurePhase = "true"; + # buildPhase = '' + # tar xf ${/home/yorick/datakami/r8/cog-triton-r8/build-dir.tar.zst} + # cd source/cpp/build/ + # runHook postBuild + # ''; + # libtensorrt_llm.so _sometimes_ wants libcudnn, so --add-needed to prevent it from being shrunk out + postBuild = '' + patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib tensorrt_llm/libtensorrt_llm.so + '' + (lib.optionalString withPython '' pushd ../../ chmod -R +w . - mkdir ./libs - cp -r cpp/build/tensorrt_llm/libtensorrt_llm.so ./libs - cp -r cpp/build/tensorrt_llm/thop/libth_common.so ./libs - cp -r cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so* ./libs + mkdir -p ./libs + cp -ar cpp/build/tensorrt_llm/libtensorrt_llm.so ./libs + cp -ar cpp/build/tensorrt_llm/thop/libth_common.so ./libs + cp -ar cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so* ./libs + cp -ar cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/libtensorrt_llm_nvrtc_wrapper.so ./libs + cp -ar cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention.so ./libs + mkdir -p ./bin + cp -r cpp/build/tensorrt_llm/executor_worker/executorWorker ./bin cp -r cpp/build/tensorrt_llm/pybind/bindings.*.so . - python -m pybind11_stubgen -o . bindings - mv bindings libs bindings.*.so tensorrt_llm + + LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${cudaPackages.cuda_cudart.stubs}/lib python -m pybind11_stubgen -o . 
bindings + rm -rf tensorrt_llm/{bin,bindings,libs} + mv bin bindings libs bindings.*.so tensorrt_llm + patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so.10 libnvinfer_plugin_tensorrt_llm.so --add-rpath '$ORIGIN/../libs' ./tensorrt_llm/bin/executorWorker python setup.py bdist_wheel popd - ''; + ''); + # noAuditTmpdir = true; # todo pythonOutputDistHook # Install isn't well-defined, -backend just expects the build directory to exist somewhere. # Since we just copy build outputs, cmake doesn't get a chance to relink with the correct rpath. - # sed the rpath in place manually - # Also, libtensorrt_llm.so _sometimes_ wants libcudnn, so --add-needed to prevent it from being shrunk out installPhase = '' mkdir -p $out - ${rsync}/bin/rsync -a --exclude "tensorrt_llm/kernels" $src/cpp $out/ - chmod -R u+w $out/cpp - mkdir -p $out/cpp/build/tensorrt_llm/plugins + rsync -a --chmod=u+w --include "tensorrt_llm/kernels/" --include "tensorrt_llm/kernels/kvCacheIndex.h" --exclude "tensorrt_llm/kernels/*" $src/cpp $out/ + pushd $src/cpp/tensorrt_llm + find . '(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/tensorrt_llm/ + popd + # rsync -a --chmod=u+w $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ pushd tensorrt_llm - cp ./libtensorrt_llm.so $out/cpp/build/tensorrt_llm/ - patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so - cp ./plugins/libnvinfer_plugin_tensorrt_llm.so* $out/cpp/build/tensorrt_llm/plugins/ - for f in $out/cpp/build/tensorrt_llm/plugins/*.so*; do - if [ ! -L "$f" ]; then - new_path=$(patchelf --print-rpath "$f" | sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#') - patchelf --set-rpath "$new_path" "$f" - fi - done + mkdir -p $out/cpp/build/tensorrt_llm/ + find . 
'(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/build/tensorrt_llm/ popd '' + (lib.optionalString withPython '' mv ../../dist $dist pushd $dist - python -m pip install ./*.whl --no-index --no-warn-script-location --prefix="$python" --no-cache + python -m pip install ./*.whl --no-index --no-warn-script-location --prefix="$python" --no-cache --no-deps popd ''); - postFixup = lib.optionalString withPython '' + # manually call autoPatchelf so it doesn't cross-link the outputs + dontAutoPatchelf = true; + # move the propagatedBuildInputs to $python + postFixup = (lib.optionalString withPython '' mv $out/nix-support $python/ + autoPatchelf $python + '') + '' + autoPatchelf $out ''; + # imports check, wants nvml + # pushd $python/${python3.sitePackages} + # python -c "import tensorrt_llm.bindings" + # popd passthru.examples = runCommand "trt-examples" {} '' mkdir $out cp -r ${o.src}/examples $out/examples ''; + passthru.pythonModule = python3; }) diff --git a/nix/torch.nix b/nix/torch.nix new file mode 100644 index 0000000..8594c99 --- /dev/null +++ b/nix/torch.nix @@ -0,0 +1,28 @@ +{ python3, magma-cuda-static, cudaPackages }: +(python3.pkgs.torchWithCuda.override { + torchWithCuda = null; # ?!, not used + cudaSupport = true; + inherit cudaPackages; + magma-cuda-static = magma-cuda-static.override { inherit cudaPackages; }; + future = null; + tensorboard = null; + hypothesis = null; + cffi = null; + openai-triton = null; +}).overridePythonAttrs + (o: { + nativeBuildInputs = o.nativeBuildInputs ++ [ python3.pkgs.setuptools ]; + dependencies = o.dependencies ++ [ python3.pkgs.requests ]; + USE_CUDNN = 0; + USE_KINETO = 0; + USE_QNNPACK = 0; + USE_PYTORCH_QNNPACK = 0; + USE_XNNPACK = 0; + INTERN_DISABLE_ONNX = 1; + ONNX_ML = 0; + USE_ITT = 0; + USE_FLASH_ATTENTION = 0; + USE_MEM_EFF_ATTENTION = 0; + USE_FBGEMM = 0; + USE_MKLDNN = 0; + }) diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 4dab9b2..b4d0519 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -19,21 +19,21 @@ let deps.triton_repo_common = fetchFromGitHub { owner = "triton-inference-server"; repo = "common"; - rev = "00b3a71519e32e3bc954e9f0d067e155ef8f1a6c"; - hash = "sha256-KyFicnB0716nIteSNo43RoiDzuVbj17KM4tIbmN6F+s="; + rev = "0f2072bbc2d4e85f68b10cf60c0ed4e4ebfc766b"; + hash = "sha256-7DdJ1zkHrFEImI137Gt/pDKZhBvoQu0lg2ulqA/yLFA="; }; deps.triton_repo_backend = fetchFromGitHub { owner = "triton-inference-server"; repo = "backend"; + # update for tritons after may 28, 2024 rev = "a06e9a1157d6b5b9b34b6d05a07bb84d517f17c9"; hash = "sha256-Ju2zV/jHUuciTs6GbkqcPG8U0y2lkIWSdAsX78DrpV4="; }; - # todo: update with trt-llm 0.9? 
deps.triton_repo_core = fetchFromGitHub { owner = "triton-inference-server"; repo = "core"; - rev = "5d4a99c285c729a349265ce8dd7a4535e59d29b1"; - hash = "sha256-WP8bwplo98GmNulX+QA+IrQEc2+GMcTjV53K438vX1g="; + rev = "bbcd7816997046821f9d1a22e418acb84ca5364b"; + hash = "sha256-LWLxMvtV0VQYMQQIfztm10xzQreNAoN9zAexf+5ktHo="; }; deps.googletest = fetchFromGitHub { owner = "google"; @@ -43,18 +43,18 @@ let }; inherit (python3) sitePackages; - trt_lib_dir = "${pythonDrvs.tensorrt-libs.public}/${sitePackages}/tensorrt_libs"; + trt_lib_dir = "${pythonDrvs.tensorrt-cu12-libs.public}/${sitePackages}/tensorrt_libs"; # this package wants gcc12 oldGccStdenv = stdenvAdapters.useLibsFrom stdenv gcc12Stdenv; in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.9.0"; + version = "0.12.0.dev2024073000"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; - rev = "v${version}"; - hash = "sha256-aNjVYu7sDrIj/lse/wS3vYaR/vmjtZfxzBWYi3z3KqQ="; + rev = "b25d578a48422db3b2d5bd89b16c235dd85c4300"; + hash = "sha256-UxuMdhkMv89Ozxi4jXioOfR1gf/cYr/bCxt/RG6CdZw="; }; nativeBuildInputs = [ cmake @@ -70,6 +70,8 @@ oldGccStdenv.mkDerivation rec { cudaPackages.cuda_cccl cudaPackages.libcublas.lib cudaPackages.libcublas.dev + cudaPackages.cuda_nvml_dev.lib + cudaPackages.cuda_nvml_dev.dev ]; sourceRoot = "source/inflight_batcher_llm"; cmakeFlags = [ @@ -84,7 +86,7 @@ oldGccStdenv.mkDerivation rec { ]; postInstall = '' mkdir -p $out/backends/tensorrtllm - cp libtriton_*.so triton_tensorrtllm_worker $out/backends/tensorrtllm + cp libtriton_*.so trtllmExecutorWorker $out/backends/tensorrtllm rm -r /build/source/inflight_batcher_llm/build/_deps/repo-core-build rm -r /build/source/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so ''; @@ -94,7 +96,7 @@ oldGccStdenv.mkDerivation rec { --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' patchelf $out/backends/tensorrtllm/libtriton_tensorrtllm_common.so \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' - patchelf $out/backends/tensorrtllm/triton_tensorrtllm_worker \ - --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' + patchelf $out/backends/tensorrtllm/trtllmExecutorWorker \ + --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib:${tensorrt-llm}/cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention:${tensorrt-llm}/cpp/build/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper' ''; } diff --git a/predict.py b/predict.py index 9273485..e363113 100644 --- a/predict.py +++ b/predict.py @@ -371,6 +371,9 @@ async def predict( f"E2104 TritonMalformedEvent: Triton returned malformed event (no output_ids or error key): {event_data}" ) + if token == []: + continue + n_tokens += 1 if n_tokens == 1: first_token_time = time.time() @@ -446,10 +449,13 @@ def _process_args( pad_id = self.pad_id end_id = self.end_id - if top_k < 0: - top_k = 0 - if min_tokens < 0: - min_tokens = 0 + if top_k <= 0: + # workaround, unnecessary with trtllm > 0.10.0 + top_k = None + + if top_p <= 0.0: + # workaround, unnecessary with trtllm > 0.10.0 + top_p = None if not seed: seed = 
int(np.random.randint(0, 100000)) @@ -459,7 +465,11 @@ def _process_args( max_tokens = min(max_tokens, token_budget) min_tokens = min(min_tokens, token_budget) - args = { + if min_tokens <= 0: + # workaround, unnecessary with trtllm > 0.10.0 + min_tokens = None + + args = {k: v for k, v in { "text_input": prompt, "max_tokens": max_tokens, "min_length": min_tokens, @@ -473,7 +483,7 @@ "random_seed": seed, "pad_id": pad_id, "end_id": end_id, - } + }.items() if v is not None} return args diff --git a/triton_config_generator.py b/triton_config_generator.py index 0a3dc8f..5d904a0 100644 --- a/triton_config_generator.py +++ b/triton_config_generator.py @@ -53,7 +53,7 @@ def get_config_paths(model_config, model): def generate_configs(config): - models = ['preprocessing', 'tensorrt_llm', 'postprocessing', 'ensemble', 'tensorrt_llm_bls'] + models = ['preprocessing', 'tensorrt_llm', 'postprocessing', 'ensemble'] for model in models: if model not in config: @@ -77,4 +77,4 @@ def main(yaml_file): parser.add_argument('yaml_file', help='Path to the YAML configuration file.') args = parser.parse_args() - main(args.yaml_file) \ No newline at end of file + main(args.yaml_file) diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 40d291d..b279740 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -31,12 +31,24 @@ input [ { name: "text_input" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ 1 ] + optional: true + }, + { + name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] + optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -159,16 +171,16 @@ input [ optional: true }, { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true }, { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true } ] output [ @@ -196,6 +208,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] ensemble_scheduling { @@ -207,6 +224,14 @@ ensemble_scheduling { key: "QUERY" value: "text_input" } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } + input_map { + key: "IMAGE" + value: "image_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -235,6 +260,10 @@ ensemble_scheduling { key: "PAD_ID" value: "pad_id" } + input_map { + key: "PROMPT_EMBEDDING_TABLE" + value: "prompt_embedding_table" + } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" @@ -243,6 +272,14 @@ key: "INPUT_ID" value: "_INPUT_ID" } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" @@ -267,6 +304,10 @@ ensemble_scheduling { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } + output_map { + key: "OUT_PROMPT_EMBEDDING_TABLE" + value: "out_prompt_embedding_table" + } }, { model_name: "tensorrt_llm" @@ -275,10 +316,18 @@ ensemble_scheduling { key: "input_ids" value: "_INPUT_ID" } + input_map { + key: 
"decoder_input_ids" + value: "_DECODER_INPUT_ID" + } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" @@ -353,7 +402,7 @@ ensemble_scheduling { } input_map { key: "prompt_embedding_table" - value: "prompt_embedding_table" + value: "out_prompt_embedding_table" } input_map { key: "prompt_vocab_size" @@ -390,6 +439,10 @@ ensemble_scheduling { output_map { key: "generation_logits" value: "_GENERATION_LOGITS" + }, + output_map { + key: "batch_index" + value: "_BATCH_INDEX" } }, { @@ -419,6 +472,10 @@ ensemble_scheduling { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } + input_map { + key: "BATCH_INDEX" + value: "_BATCH_INDEX" + } output_map { key: "OUTPUT" value: "output_ids" @@ -439,7 +496,10 @@ ensemble_scheduling { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } + output_map { + key: "OUT_BATCH_INDEX" + value: "batch_index" + } } ] } - diff --git a/triton_model_repo/postprocessing/1/model.py b/triton_model_repo/postprocessing/1/model.py index 5d5663b..e9b0e55 100644 --- a/triton_model_repo/postprocessing/1/model.py +++ b/triton_model_repo/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -55,17 +55,35 @@ def initialize(self, args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + + skip_special_tokens = model_config['parameters'].get( + 'skip_special_tokens') + if skip_special_tokens is not None: + skip_special_tokens_str = skip_special_tokens[ + 'string_value'].lower() + if skip_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.skip_special_tokens = skip_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + ) + self.skip_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." + ) + self.skip_special_tokens = True + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # if not self.tokenizer.pad_token: + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -124,6 +142,10 @@ def execute(self, requests): generation_logits = pb_utils.get_input_tensor_by_name( request, 'GENERATION_LOGITS') + # Get the batch index + batch_index = pb_utils.get_input_tensor_by_name( + request, 'BATCH_INDEX') + # Reshape Input # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) # tokens_batch = tokens_batch.T @@ -133,16 +155,11 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. 
- output_tensor = pb_utils.Tensor( 'OUTPUT', tokens_batch ) - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) - outputs = [] outputs.append(output_tensor) @@ -185,6 +202,15 @@ def execute(self, requests): np.array([[[[0.0]]]], dtype=np.float32)) outputs.append(out_generation_logits) + if batch_index: + out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX', + batch_index.as_numpy()) + outputs.append(out_batch_index) + else: + out_batch_index = pb_utils.Tensor( + 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32)) + outputs.append(out_batch_index) + # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. # Below is an example of how you can set errors in inference @@ -207,13 +233,19 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # # Exclude fake ids in multimodal models + # fake_id_len = 0 + # for i in range(seq_len): + # if tokens[i] < self.tokenizer.vocab_size: + # fake_id_len = i + # break + # output = self.tokenizer.decode( + # tokens[fake_id_len:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt index 5e2e37a..a599265 100644 --- a/triton_model_repo/postprocessing/config.pbtxt +++ b/triton_model_repo/postprocessing/config.pbtxt @@ -61,6 +61,12 @@ input [ data_type: TYPE_FP32 dims: [ -1, -1, -1 ] optional: true + }, + { + name: "BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true } ] output [ @@ -88,6 +94,11 @@ output [ name: "OUT_GENERATION_LOGITS" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "OUT_BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] } ] @@ -101,7 +112,7 @@ parameters { parameters { key: "skip_special_tokens" value: { - string_value: "True" + string_value: "${skip_special_tokens}" } } @@ -111,4 +122,3 @@ instance_group [ kind: KIND_CPU } ] - diff --git a/triton_model_repo/preprocessing/1/model.py b/triton_model_repo/preprocessing/1/model.py index a109775..7e8f677 100644 --- a/triton_model_repo/preprocessing/1/model.py +++ b/triton_model_repo/preprocessing/1/model.py @@ -25,10 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import json +import os from typing import List import numpy as np +# import tensorrt as trt +# import torch import triton_python_backend_utils as pb_utils +# from torch.utils.dlpack import from_dlpack from transformers import AutoTokenizer, T5Tokenizer @@ -56,11 +60,32 @@ def initialize(self, args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.add_special_tokens = model_config['parameters'].get( - 'add_special_tokens', - {'string_value': "false"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] + + add_special_tokens = model_config['parameters'].get( + 'add_special_tokens') + visual_model_path = model_config['parameters']['visual_model_path'][ + 'string_value'] + if visual_model_path == "${visual_model_path}" or visual_model_path == "": + visual_model_path = None + + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens['string_value'].lower() + if add_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.add_special_tokens = add_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." + ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." + ) + self.add_special_tokens = True self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, legacy=False, @@ -68,17 +93,60 @@ def initialize(self, args): trust_remote_code=True) if isinstance(self.tokenizer, T5Tokenizer): self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - self.tokenizer.pad_token = self.tokenizer.eos_token + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer_end_id = self.tokenizer.encode( self.tokenizer.eos_token, add_special_tokens=False)[0] self.tokenizer_pad_id = self.tokenizer.encode( self.tokenizer.pad_token, add_special_tokens=False)[0] + self.visual_engine = None + self.visual_context = None + self.stream = None + self.vocab_size = None + self.dtype = None + if visual_model_path is not None: + llm_model_path = model_config['parameters']['gpt_model_path'][ + 'string_value'] + llm_model_path = os.path.join(llm_model_path, 'config.json') + + vision_encoder_path = os.path.join(visual_model_path, + 'model.engine') + with open(vision_encoder_path, 'rb') as f: + engine_buffer = f.read() + + self.stream = torch.cuda.Stream() + torch.cuda.set_stream(self.stream) + + trt_logger = trt.Logger(trt.Logger.WARNING) + visual_runtime = trt.Runtime(trt_logger) + if engine_buffer is not None: + self.visual_engine = visual_runtime.deserialize_cuda_engine( + engine_buffer) + self.visual_context = self.visual_engine.create_execution_context() + self.visual_context.set_optimization_profile_async( + 0, self.stream.cuda_stream) + + assert self.visual_engine.get_tensor_dtype( + 'input' + ) == trt.float16 and self.visual_engine.get_tensor_dtype( + 'output' + ) == trt.float16 and self.visual_engine.num_io_tensors == 2, "Please use the model built in examples/multimodal." 
+ + self.stream.synchronize() + + with open(llm_model_path, 'r') as f: + llm_model_config = json.load(f) + self.vocab_size = int( + llm_model_config["pretrained_config"]["vocab_size"]) + # Parse model output configs and convert Triton types to numpy types output_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID" + "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", + "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_EMBEDDING_TABLE" ] input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] for input_name in input_names: @@ -126,16 +194,33 @@ def execute(self, requests): # Get input tensors query = pb_utils.get_input_tensor_by_name(request, 'QUERY').as_numpy() - batch_dim = query.shape[0] - if batch_dim != 1: + batch_size = query.shape[0] + + decoder_query = pb_utils.get_input_tensor_by_name( + request, 'DECODER_QUERY') + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + + image = pb_utils.get_input_tensor_by_name(request, 'IMAGE') + if image is not None: + image = from_dlpack(image.to_dlpack()).cuda().half() + if self.visual_engine is None: + err_str = "Images cannot be processed without a vision model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue - err_str = "Inflight batching backend expects requests with batch size of 1." - logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(err_str))) - continue + if image.shape[0] != batch_size: + err_str = "Query and Image have different batch sizes." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue request_output_len = pb_utils.get_input_tensor_by_name( request, 'REQUEST_OUTPUT_LEN').as_numpy() @@ -160,13 +245,65 @@ def execute(self, requests): if embedding_bias_weights is not None: embedding_bias_weights = embedding_bias_weights.as_numpy() + prompt_embedding_table_tensor = pb_utils.get_input_tensor_by_name( + request, 'PROMPT_EMBEDDING_TABLE') + if prompt_embedding_table_tensor is not None: + prompt_embedding_table = prompt_embedding_table_tensor.as_numpy( + ) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', prompt_embedding_table) + + if image is not None and prompt_embedding_table_tensor is not None: + + err_str = "Image and prompt table cannot be provided simultaneously." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + + visual_output = None + if image is not None: + ok = self.visual_context.set_input_shape('input', image.shape) + if not ok: + err_str = "Image has wrong shape." 
+ logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.visual_context.set_tensor_address('input', + image.data_ptr()) + + visual_output_shape = self.visual_context.get_tensor_shape( + 'output') + visual_output = torch.empty(tuple(visual_output_shape), + dtype=torch.float16, + device=image.device) + self.visual_context.set_tensor_address( + 'output', visual_output.data_ptr()) + + ok = self.visual_context.execute_async_v3( + self.stream.cuda_stream) + if not ok: + err_str = "Runtime execution failed for vision encoder model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.stream.synchronize() + # Take the end_id from the input tensors # If not specified, use tokenizer to get end_id end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') if end_id is not None: end_id = end_id.as_numpy() else: - end_id = [[self.tokenizer_end_id]] + end_id = [[self.tokenizer_end_id]] * batch_size # Take the pad_id from the input tensors # If not specified, use tokenizer to get pad_id @@ -174,16 +311,31 @@ def execute(self, requests): if pad_id is not None: pad_id = pad_id.as_numpy() else: - pad_id = [[self.tokenizer_pad_id]] + pad_id = [[self.tokenizer_pad_id]] * batch_size # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) + input_id, request_input_len = self._create_request( + query, visual_output) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query) + else: + decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) + request_decoder_input_len = 1 * np.ones( + (batch_size, 1), np.int32) + + bad_words = self._to_word_list_format(bad_words_dict, batch_size) + stop_words = self._to_word_list_format(stop_words_dict, batch_size) embedding_bias = self._get_embedding_bias( embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype) + self.embedding_bias_weights_dtype, batch_size) + + if image is not None: + prompt_table = np.array(visual_output.cpu()) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', + prompt_table.astype(self.out_prompt_embedding_table_dtype)) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. 
@@ -192,6 +344,13 @@ def execute(self, requests): request_input_len_tensor = pb_utils.Tensor( 'REQUEST_INPUT_LEN', request_input_len.astype(self.request_input_len_dtype)) + decoder_input_id_tensor = pb_utils.Tensor( + 'DECODER_INPUT_ID', + decoder_input_id.astype(self.decoder_input_id_dtype)) + request_decoder_input_len_tensor = pb_utils.Tensor( + 'REQUEST_DECODER_INPUT_LEN', + request_decoder_input_len.astype( + self.request_decoder_input_len_dtype)) request_output_len_tensor = pb_utils.Tensor( 'REQUEST_OUTPUT_LEN', request_output_len) bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) @@ -204,11 +363,27 @@ def execute(self, requests): pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', np.array(pad_id, dtype=np.int32)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor, - embedding_bias_tensor, end_id_tensor, pad_id_tensor - ]) + if prompt_embedding_table_tensor is not None: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor, + prompt_embedding_table_tensor + ]) + else: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor + ]) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length @@ -222,7 +397,7 @@ def finalize(self): """ print('Cleaning up...') - def _create_request(self, query): + def _create_request(self, query, visual_features): """ query : batch string (2D numpy array) """ @@ -240,6 +415,14 @@ def _create_request(self, query): add_special_tokens=self.add_special_tokens)).astype( int) for s in query ] + if visual_features is not None: + fake_prompt_id = np.arange( + self.vocab_size, self.vocab_size + visual_features.shape[1]) + start_ids = [ + np.concatenate((fake_prompt_id, ids), axis=0) + for ids in start_ids + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) max_len = 0 @@ -254,7 +437,8 @@ def _create_request(self, query): return start_ids, start_lengths - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + def _to_word_list_format(self, word_lists: List[List[str | bytes]], + batch_size): ''' word_lists format: len(word_lists) == batch_size @@ -264,15 +448,10 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if word_lists is None: # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") + return np.empty([batch_size, 2, 0], dtype="int32") flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" 
- arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +460,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence. " - "We then strip the corresponding first token from the stop sequence IDs. " - f"However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggests there is a problem with the tokenizer that you are using." - ) - else: - ids = ids[1:] if len(ids) == 0: continue @@ -312,12 +482,13 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): (1, 0, 2)) def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype): + bias_dtype, batch_size): assert self.tokenizer != None, "need to set tokenizer" if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + return np.empty([batch_size, 0], + dtype=self.embedding_bias_weights_dtype) batch_embedding_bias = [] for words, weights in zip(embedding_bias_words, diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt index b0fa8f2..9b0348c 100644 --- a/triton_model_repo/preprocessing/config.pbtxt +++ b/triton_model_repo/preprocessing/config.pbtxt @@ -31,12 +31,24 @@ input [ { name: "QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ 1 ] + optional: true + }, + { + name: "IMAGE" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] + optional: true }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "BAD_WORDS_DICT" @@ -65,14 +77,21 @@ input [ { name: "END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true }, { name: "PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true + }, + { + name: "PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true } ] output [ @@ -86,6 +105,16 @@ output [ data_type: TYPE_INT32 dims: [ 1 ] }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, { name: "BAD_WORDS_IDS" data_type: TYPE_INT32 @@ -109,12 +138,17 @@ output [ { name: "OUT_END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "OUT_PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "OUT_PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] } ] @@ -132,10 +166,23 @@ parameters { } } +parameters { + key: "visual_model_path" + value: { + string_value: "${visual_model_path}" + } +} + +parameters: { + key: "gpt_model_path" + value: { + string_value: "${engine_dir}" + } +} + instance_group [ { count: 64 kind: KIND_CPU } ] - diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 911fdeb..504291a 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -35,6 +35,7 @@ model_transaction_policy { dynamic_batching { 
preferred_batch_size: [ 64 ] max_queue_delay_microseconds: 100 + default_queue_policy: { max_queue_size: 0 } } input [ @@ -54,6 +55,7 @@ input [ name: "request_output_len" data_type: TYPE_INT32 dims: [ 1 ] + reshape: { shape: [ ] } }, { name: "draft_input_ids" @@ -62,6 +64,20 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, { name: "draft_logits" data_type: TYPE_FP32 @@ -69,6 +85,13 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +155,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +183,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +204,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -199,12 +257,14 @@ input [ name: "stop" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { name: "streaming" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { @@ -294,6 +354,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] instance_group [ @@ -323,7 +388,13 @@ parameters: { parameters: { key: "gpt_model_path" value: { - string_value: "/src/triton_model_repo/tensorrt_llm/1" + string_value: "/src/triton_model_repo/tensorrt_llm/1/" + } +} +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" } } parameters: { @@ -335,7 +406,13 @@ parameters: { parameters: { key: "max_attention_window_size" value: { - string_value: "${max_attention_window_size}" + string_value: "4096" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" } } parameters: { @@ -347,21 +424,58 @@ parameters: { parameters: { key: "kv_cache_free_gpu_mem_fraction" value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" + string_value: "0.95" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_onboard_blocks" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_onboard_blocks}" } } +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" 
} } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +531,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ -428,4 +542,9 @@ parameters: { string_value: "${medusa_choices}" } } - +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git a/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py b/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py deleted file mode 100644 index aa2a6d5..0000000 --- a/triton_model_repo/tensorrt_llm_bls/1/lib/decode.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - max_tokens: np.ndarray = np.array([]) - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, - "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) - gen_logits = _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not stream, - "streaming is not supported with speculative decoding") - _validate_that( - not gen_logits, - "generation logits are not supported with speculative decoding" - ) - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding") - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - input_lengths: np.ndarray = np.array([]) - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - - @classmethod - def with_new_inputs(cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None): - return cls( - input_ids=(input_ids - if input_ids is not None else other.input_ids), - input_lengths=(input_lengths if input_lengths is not None else - other.input_lengths), - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - 
sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return (np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) and - np.array_equal(self.generation_logits, o.generation_logits)) - - -class Decoder: - - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode(self, - request: Request, - speculative_decoding=False) -> Generator[Response, None, None]: - preproc_response = self.preprocess(request) - - if speculative_decoding: - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming: - gen_response = self._generate_non_streaming( - preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, - request: Request) -> Generator[GenerationResponse, None, None]: - - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - if draft_response.generation_logits is not None: - draft_logits = draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids):seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0)) - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens):], 0) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request) - last_input_ids = input_ids - input_ids = 
target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32)) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= - len(prompt_input_ids) + output_len) - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0]) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = new_tokens if ( - self._accumulated_tokens is None) else np.concatenate( - (self._accumulated_tokens, new_tokens), axis=2) - sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]], - dtype=np.int32) - return self._postprocess(self._accumulated_tokens, - sequence_lengths, gen_response) - else: - return self._postprocess(gen_response.output_ids, None, - gen_response) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index f0df3b8..0000000 --- a/triton_model_repo/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - - def __init__(self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "REQUEST_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - ] - - self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise 
pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits" - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, - triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError( - f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response(self, - triton_response, - response_factory: Callable, - name_map=None): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'") - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": "stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - } - return self.convert_triton_response(triton_output, PreprocResponse, - name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, - num_draft_tokens, None, True) - triton_req = 
pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - triton_response = self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors(self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - tensors = [] - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request(request, num_output_tokens, - draft_request, - is_draft_model_request)) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - "presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) - - if draft_request: - if draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", - draft_request.draft_input_ids)) - if draft_request.draft_logits is not None and request.use_draft_logits is not None 
and request.use_draft_logits[ - 0]: - tensors.append( - pb_utils.Tensor("draft_logits", - draft_request.draft_logits)) - - return_context_logits = False - return_generation_logits = False - if draft_request is None: - if is_draft_model_request: - return_generation_logits = request.use_draft_logits[ - 0] if request.use_draft_logits is not None else False - else: - return_context_logits = request.return_context_logits[ - 0] if request.return_context_logits is not None else False - return_generation_logits = request.return_generation_logits[ - 0] if request.return_generation_logits is not None else False - - tensors.append( - pb_utils.Tensor("return_context_logits", - np.array([[return_context_logits]]))) - tensors.append( - pb_utils.Tensor("return_generation_logits", - np.array([[return_generation_logits]]))) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - } - return self.convert_triton_response(triton_output, GenerationResponse, - name_map) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - input_tensors = self._get_postproc_tensors(tokens, sequence_lengths, - gen_response) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", sequence_lengths - if sequence_lengths else gen_response.sequence_length) - ] - return tensors - - def _get_response(self, triton_output, gen_res: GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response(text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits) - return response diff --git a/triton_model_repo/tensorrt_llm_bls/1/model.py b/triton_model_repo/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 609e323..0000000 --- a/triton_model_repo/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - - def initialize(self, args): - - # Parse model configs - model_config = json.loads(args['model_config']) - - params = model_config['parameters'] - - accumulate_tokens_str = '' - if 'accumulate_tokens' in params: - accumulate_tokens_str = params['accumulate_tokens']['string_value'] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - 'true', 'yes', '1', 't' - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"][ - "string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params[ - "tensorrt_llm_draft_model_name"]["string_value"] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name) - - def execute(self, requests): - - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - - req = self.decoder.convert_triton_request(request) - req.validate() - speculative_decode = (req.num_draft_tokens is not None - and req.num_draft_tokens[0][0] > 0) - if speculative_decode and (self.draft_llm_model_name is None - or self.draft_llm_model_name == ""): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(traceback.format_exc())) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when 
the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index 17989a9..0000000 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
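# Condensed view of when the removed BLS model routed a request through
# speculative decoding, written as a hypothetical helper that mirrors the
# checks in the deleted execute() above: the request must carry a positive
# num_draft_tokens, and a draft model must have been configured via the
# tensorrt_llm_draft_model_name parameter.
def wants_speculative_decoding(req, draft_llm_model_name):
    n = req.num_draft_tokens
    if n is None or n[0][0] <= 0:
        return False  # plain (non-speculative) generation
    if not draft_llm_model_name:
        raise Exception("cannot perform speculative decoding without draft model")
    return True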
- -name: "tensorrt_llm_bls" -backend: "python" -max_batch_size: 64 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "num_draft_tokens", - data_type: TYPE_INT32, - dims: [ 1 ] - optional: true - }, - { - name: "use_draft_logits", - data_type: TYPE_BOOL, - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters: { - key: "accumulate_tokens" - value: { - string_value: "true" - } -} -parameters: { - key: "tensorrt_llm_model_name" - value: { - string_value: "${tensorrt_llm_model_name}" - } -} -parameters: { - key: "tensorrt_llm_draft_model_name" - value: { - string_value: "${tensorrt_llm_draft_model_name}" - } -} - -instance_group [ - { - count: 64 - kind : KIND_CPU - } -] - diff --git a/triton_templates/ensemble/config.pbtxt b/triton_templates/ensemble/config.pbtxt index 0e2627b..74dd3ab 100644 --- a/triton_templates/ensemble/config.pbtxt +++ b/triton_templates/ensemble/config.pbtxt @@ -31,12 +31,24 @@ input [ { name: "text_input" data_type: 
TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "decoder_text_input" + data_type: TYPE_STRING + dims: [ 1 ] + optional: true + }, + { + name: "image_input" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] + optional: true }, { name: "max_tokens" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "bad_words" @@ -159,16 +171,16 @@ input [ optional: true }, { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true + name: "embedding_bias_words" + data_type: TYPE_STRING + dims: [ -1 ] + optional: true }, { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true + name: "embedding_bias_weights" + data_type: TYPE_FP32 + dims: [ -1 ] + optional: true } ] output [ @@ -196,6 +208,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] ensemble_scheduling { @@ -207,6 +224,14 @@ ensemble_scheduling { key: "QUERY" value: "text_input" } + input_map { + key: "DECODER_QUERY" + value: "decoder_text_input" + } + input_map { + key: "IMAGE" + value: "image_input" + } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" @@ -235,6 +260,10 @@ ensemble_scheduling { key: "PAD_ID" value: "pad_id" } + input_map { + key: "PROMPT_EMBEDDING_TABLE" + value: "prompt_embedding_table" + } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" @@ -243,6 +272,14 @@ ensemble_scheduling { key: "INPUT_ID" value: "_INPUT_ID" } + output_map { + key: "REQUEST_DECODER_INPUT_LEN" + value: "_REQUEST_DECODER_INPUT_LEN" + } + output_map { + key: "DECODER_INPUT_ID" + value: "_DECODER_INPUT_ID" + } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" @@ -267,6 +304,10 @@ ensemble_scheduling { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } + output_map { + key: "OUT_PROMPT_EMBEDDING_TABLE" + value: "out_prompt_embedding_table" + } }, { model_name: "tensorrt_llm" @@ -275,10 +316,18 @@ ensemble_scheduling { key: "input_ids" value: "_INPUT_ID" } + input_map { + key: "decoder_input_ids" + value: "_DECODER_INPUT_ID" + } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } + input_map { + key: "decoder_input_lengths" + value: "_REQUEST_DECODER_INPUT_LEN" + } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" @@ -353,7 +402,7 @@ ensemble_scheduling { } input_map { key: "prompt_embedding_table" - value: "prompt_embedding_table" + value: "out_prompt_embedding_table" } input_map { key: "prompt_vocab_size" @@ -390,6 +439,10 @@ ensemble_scheduling { output_map { key: "generation_logits" value: "_GENERATION_LOGITS" + }, + output_map { + key: "batch_index" + value: "_BATCH_INDEX" } }, { @@ -419,6 +472,10 @@ ensemble_scheduling { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } + input_map { + key: "BATCH_INDEX" + value: "_BATCH_INDEX" + } output_map { key: "OUTPUT" value: "output_ids" @@ -439,6 +496,10 @@ ensemble_scheduling { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } + output_map { + key: "OUT_BATCH_INDEX" + value: "batch_index" + } } ] } diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index 5d5663b..e9b0e55 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -55,17 +55,35 @@ def initialize(self, 
args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + + skip_special_tokens = model_config['parameters'].get( + 'skip_special_tokens') + if skip_special_tokens is not None: + skip_special_tokens_str = skip_special_tokens[ + 'string_value'].lower() + if skip_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.skip_special_tokens = skip_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens' correctly (set value is {skip_special_tokens['string_value']}). Set it as True by default." + ) + self.skip_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'skip_special_tokens'. Set it as True by default." + ) + self.skip_special_tokens = True + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # if not self.tokenizer.pad_token: + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -124,6 +142,10 @@ def execute(self, requests): generation_logits = pb_utils.get_input_tensor_by_name( request, 'GENERATION_LOGITS') + # Get the batch index + batch_index = pb_utils.get_input_tensor_by_name( + request, 'BATCH_INDEX') + # Reshape Input # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]]) # tokens_batch = tokens_batch.T @@ -133,16 +155,11 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( 'OUTPUT', tokens_batch ) - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) - outputs = [] outputs.append(output_tensor) @@ -185,6 +202,15 @@ def execute(self, requests): np.array([[[[0.0]]]], dtype=np.float32)) outputs.append(out_generation_logits) + if batch_index: + out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX', + batch_index.as_numpy()) + outputs.append(out_batch_index) + else: + out_batch_index = pb_utils.Tensor( + 'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32)) + outputs.append(out_batch_index) + # Create InferenceResponse. You can set an error here in case # there was a problem with handling this inference request. 
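# The string-valued parameters handled above ('skip_special_tokens' here, and
# 'add_special_tokens' in the preprocessor later in this diff) accept a fixed
# set of spellings. A compact equivalent of the parsing this diff adds, as a
# hypothetical helper (the real code also prints a [TensorRT-LLM][WARNING]
# before falling back to the default):
def parse_bool_param(params, key, default=True):
    entry = params.get(key)
    if entry is None:
        return default
    value = entry['string_value'].lower()
    if value in ('true', '1', 't', 'y', 'yes'):
        return True
    if value in ('false', '0', 'f', 'n', 'no'):
        return False
    return default  # unrecognized spellings fall back to the default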
# Below is an example of how you can set errors in inference @@ -207,13 +233,19 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # # Exclude fake ids in multimodal models + # fake_id_len = 0 + # for i in range(seq_len): + # if tokens[i] < self.tokenizer.vocab_size: + # fake_id_len = i + # break + # output = self.tokenizer.decode( + # tokens[fake_id_len:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_templates/postprocessing/config.pbtxt b/triton_templates/postprocessing/config.pbtxt index 67b8b8a..2ebda5e 100644 --- a/triton_templates/postprocessing/config.pbtxt +++ b/triton_templates/postprocessing/config.pbtxt @@ -61,6 +61,12 @@ input [ data_type: TYPE_FP32 dims: [ -1, -1, -1 ] optional: true + }, + { + name: "BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true } ] output [ @@ -88,6 +94,11 @@ output [ name: "OUT_GENERATION_LOGITS" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "OUT_BATCH_INDEX" + data_type: TYPE_INT32 + dims: [ 1 ] } ] @@ -101,7 +112,7 @@ parameters { parameters { key: "skip_special_tokens" value: { - string_value: "True" + string_value: "${skip_special_tokens}" } } diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index 0f561f7..7e8f677 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -25,10 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json +import os from typing import List import numpy as np +# import tensorrt as trt +# import torch import triton_python_backend_utils as pb_utils +# from torch.utils.dlpack import from_dlpack from transformers import AutoTokenizer, T5Tokenizer @@ -56,11 +60,32 @@ def initialize(self, args): model_config = json.loads(args['model_config']) tokenizer_dir = model_config['parameters']['tokenizer_dir'][ 'string_value'] - self.add_special_tokens = model_config['parameters'].get( - 'add_special_tokens', - {'string_value': "false"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] + + add_special_tokens = model_config['parameters'].get( + 'add_special_tokens') + visual_model_path = model_config['parameters']['visual_model_path'][ + 'string_value'] + if visual_model_path == "${visual_model_path}" or visual_model_path == "": + visual_model_path = None + + if add_special_tokens is not None: + add_special_tokens_str = add_special_tokens['string_value'].lower() + if add_special_tokens_str in [ + 'true', 'false', '1', '0', 't', 'f', 'y', 'n', 'yes', 'no' + ]: + self.add_special_tokens = add_special_tokens_str in [ + 'true', '1', 't', 'y', 'yes' + ] + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens' correctly (set value is {add_special_tokens['string_value']}). Set it as True by default." 
+ ) + self.add_special_tokens = True + else: + print( + f"[TensorRT-LLM][WARNING] Don't setup 'add_special_tokens'. Set it as True by default." + ) + self.add_special_tokens = True self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, legacy=False, @@ -68,17 +93,60 @@ def initialize(self, args): trust_remote_code=True) if isinstance(self.tokenizer, T5Tokenizer): self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id() - self.tokenizer.pad_token = self.tokenizer.eos_token + + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer_end_id = self.tokenizer.encode( self.tokenizer.eos_token, add_special_tokens=False)[0] self.tokenizer_pad_id = self.tokenizer.encode( self.tokenizer.pad_token, add_special_tokens=False)[0] + self.visual_engine = None + self.visual_context = None + self.stream = None + self.vocab_size = None + self.dtype = None + if visual_model_path is not None: + llm_model_path = model_config['parameters']['gpt_model_path'][ + 'string_value'] + llm_model_path = os.path.join(llm_model_path, 'config.json') + + vision_encoder_path = os.path.join(visual_model_path, + 'model.engine') + with open(vision_encoder_path, 'rb') as f: + engine_buffer = f.read() + + self.stream = torch.cuda.Stream() + torch.cuda.set_stream(self.stream) + + trt_logger = trt.Logger(trt.Logger.WARNING) + visual_runtime = trt.Runtime(trt_logger) + if engine_buffer is not None: + self.visual_engine = visual_runtime.deserialize_cuda_engine( + engine_buffer) + self.visual_context = self.visual_engine.create_execution_context() + self.visual_context.set_optimization_profile_async( + 0, self.stream.cuda_stream) + + assert self.visual_engine.get_tensor_dtype( + 'input' + ) == trt.float16 and self.visual_engine.get_tensor_dtype( + 'output' + ) == trt.float16 and self.visual_engine.num_io_tensors == 2, "Please use the model built in examples/multimodal." + + self.stream.synchronize() + + with open(llm_model_path, 'r') as f: + llm_model_config = json.load(f) + self.vocab_size = int( + llm_model_config["pretrained_config"]["vocab_size"]) + # Parse model output configs and convert Triton types to numpy types output_names = [ - "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", - "OUT_END_ID", "OUT_PAD_ID" + "INPUT_ID", "DECODER_INPUT_ID", "REQUEST_INPUT_LEN", + "REQUEST_DECODER_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS", + "OUT_END_ID", "OUT_PAD_ID", "OUT_PROMPT_EMBEDDING_TABLE" ] input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"] for input_name in input_names: @@ -126,16 +194,33 @@ def execute(self, requests): # Get input tensors query = pb_utils.get_input_tensor_by_name(request, 'QUERY').as_numpy() - batch_dim = query.shape[0] - if batch_dim != 1: + batch_size = query.shape[0] + + decoder_query = pb_utils.get_input_tensor_by_name( + request, 'DECODER_QUERY') + if decoder_query is not None: + decoder_query = decoder_query.as_numpy() + + image = pb_utils.get_input_tensor_by_name(request, 'IMAGE') + if image is not None: + image = from_dlpack(image.to_dlpack()).cuda().half() + if self.visual_engine is None: + err_str = "Images cannot be processed without a vision model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue - err_str = "Inflight batching backend expects requests with batch size of 1." 
- logger.log_error(err_str) - responses.append( - pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(err_str))) - continue + if image.shape[0] != batch_size: + err_str = "Query and Image have different batch sizes." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue request_output_len = pb_utils.get_input_tensor_by_name( request, 'REQUEST_OUTPUT_LEN').as_numpy() @@ -160,13 +245,65 @@ def execute(self, requests): if embedding_bias_weights is not None: embedding_bias_weights = embedding_bias_weights.as_numpy() + prompt_embedding_table_tensor = pb_utils.get_input_tensor_by_name( + request, 'PROMPT_EMBEDDING_TABLE') + if prompt_embedding_table_tensor is not None: + prompt_embedding_table = prompt_embedding_table_tensor.as_numpy( + ) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', prompt_embedding_table) + + if image is not None and prompt_embedding_table_tensor is not None: + + err_str = "Image and prompt table cannot be provided simultaneously." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + + visual_output = None + if image is not None: + ok = self.visual_context.set_input_shape('input', image.shape) + if not ok: + err_str = "Image has wrong shape." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.visual_context.set_tensor_address('input', + image.data_ptr()) + + visual_output_shape = self.visual_context.get_tensor_shape( + 'output') + visual_output = torch.empty(tuple(visual_output_shape), + dtype=torch.float16, + device=image.device) + self.visual_context.set_tensor_address( + 'output', visual_output.data_ptr()) + + ok = self.visual_context.execute_async_v3( + self.stream.cuda_stream) + if not ok: + err_str = "Runtime execution failed for vision encoder model." + logger.log_error(err_str) + responses.append( + pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(err_str))) + continue + self.stream.synchronize() + # Take the end_id from the input tensors # If not specified, use tokenizer to get end_id end_id = pb_utils.get_input_tensor_by_name(request, 'END_ID') if end_id is not None: end_id = end_id.as_numpy() else: - end_id = [[self.tokenizer_end_id]] + end_id = [[self.tokenizer_end_id]] * batch_size # Take the pad_id from the input tensors # If not specified, use tokenizer to get pad_id @@ -174,16 +311,31 @@ def execute(self, requests): if pad_id is not None: pad_id = pad_id.as_numpy() else: - pad_id = [[self.tokenizer_pad_id]] + pad_id = [[self.tokenizer_pad_id]] * batch_size # Preprocessing input data. 
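# Sketch of the fake-prompt-id scheme used by the multimodal path just below
# (illustrative numbers, and assuming the usual prompt-tuning convention that
# the engine resolves ids >= vocab_size against the prompt embedding table
# rather than the vocabulary): visual features get the ids
# vocab_size .. vocab_size + num_visual_tokens - 1 prepended to the text token
# ids, while the features themselves are forwarded as OUT_PROMPT_EMBEDDING_TABLE.
# Note that the torch / tensorrt imports are commented out in this template, so
# the image branch only becomes live once they are restored and a
# visual_model_path is configured.
import numpy as np

vocab_size = 32000                      # hypothetical LLM vocab size
num_visual_tokens = 4                   # e.g. visual_output.shape[1]
text_ids = np.array([1, 15043, 3186])   # made-up tokenizer output

fake_prompt_id = np.arange(vocab_size, vocab_size + num_visual_tokens)
input_ids = np.concatenate((fake_prompt_id, text_ids))
# input_ids -> [32000, 32001, 32002, 32003, 1, 15043, 3186]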
- input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) + input_id, request_input_len = self._create_request( + query, visual_output) + if decoder_query is not None: + decoder_input_id, request_decoder_input_len = self._create_request( + decoder_query) + else: + decoder_input_id = pad_id * np.ones((batch_size, 1), np.int32) + request_decoder_input_len = 1 * np.ones( + (batch_size, 1), np.int32) + + bad_words = self._to_word_list_format(bad_words_dict, batch_size) + stop_words = self._to_word_list_format(stop_words_dict, batch_size) embedding_bias = self._get_embedding_bias( embedding_bias_words, embedding_bias_weights, - self.embedding_bias_weights_dtype) + self.embedding_bias_weights_dtype, batch_size) + + if image is not None: + prompt_table = np.array(visual_output.cpu()) + prompt_embedding_table_tensor = pb_utils.Tensor( + 'OUT_PROMPT_EMBEDDING_TABLE', + prompt_table.astype(self.out_prompt_embedding_table_dtype)) # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. @@ -192,6 +344,13 @@ def execute(self, requests): request_input_len_tensor = pb_utils.Tensor( 'REQUEST_INPUT_LEN', request_input_len.astype(self.request_input_len_dtype)) + decoder_input_id_tensor = pb_utils.Tensor( + 'DECODER_INPUT_ID', + decoder_input_id.astype(self.decoder_input_id_dtype)) + request_decoder_input_len_tensor = pb_utils.Tensor( + 'REQUEST_DECODER_INPUT_LEN', + request_decoder_input_len.astype( + self.request_decoder_input_len_dtype)) request_output_len_tensor = pb_utils.Tensor( 'REQUEST_OUTPUT_LEN', request_output_len) bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words) @@ -204,11 +363,27 @@ def execute(self, requests): pad_id_tensor = pb_utils.Tensor('OUT_PAD_ID', np.array(pad_id, dtype=np.int32)) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor, - request_input_len_tensor, request_output_len_tensor, - embedding_bias_tensor, end_id_tensor, pad_id_tensor - ]) + if prompt_embedding_table_tensor is not None: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor, + prompt_embedding_table_tensor + ]) + else: + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, decoder_input_id_tensor, + bad_words_ids_tensor, stop_words_ids_tensor, + request_input_len_tensor, + request_decoder_input_len_tensor, + request_output_len_tensor, embedding_bias_tensor, + end_id_tensor, pad_id_tensor + ]) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. 
Length @@ -222,7 +397,7 @@ def finalize(self): """ print('Cleaning up...') - def _create_request(self, query): + def _create_request(self, query, visual_features): """ query : batch string (2D numpy array) """ @@ -240,6 +415,14 @@ def _create_request(self, query): add_special_tokens=self.add_special_tokens)).astype( int) for s in query ] + if visual_features is not None: + fake_prompt_id = np.arange( + self.vocab_size, self.vocab_size + visual_features.shape[1]) + start_ids = [ + np.concatenate((fake_prompt_id, ids), axis=0) + for ids in start_ids + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) max_len = 0 @@ -254,7 +437,8 @@ def _create_request(self, query): return start_ids, start_lengths - def _to_word_list_format(self, word_lists: List[List[str | bytes]]): + def _to_word_list_format(self, word_lists: List[List[str | bytes]], + batch_size): ''' word_lists format: len(word_lists) == batch_size @@ -264,15 +448,10 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if word_lists is None: # Return an empty array of shape (1,2,0) - return np.empty([1, 2, 0], dtype="int32") + return np.empty([batch_size, 2, 0], dtype="int32") flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" - arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +460,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence." - "We then strip the corresponding first token from the stop sequence IDs." - "However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggestions there is a problem with the tokenizer that you are using." 
- ) - else: - ids = ids[1:] if len(ids) == 0: continue @@ -312,12 +482,13 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): (1, 0, 2)) def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights, - bias_dtype): + bias_dtype, batch_size): assert self.tokenizer != None, "need to set tokenizer" if embedding_bias_words is None or embedding_bias_weights is None: - return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype) + return np.empty([batch_size, 0], + dtype=self.embedding_bias_weights_dtype) batch_embedding_bias = [] for words, weights in zip(embedding_bias_words, diff --git a/triton_templates/preprocessing/config.pbtxt b/triton_templates/preprocessing/config.pbtxt index ca92187..75d49d5 100644 --- a/triton_templates/preprocessing/config.pbtxt +++ b/triton_templates/preprocessing/config.pbtxt @@ -31,12 +31,24 @@ input [ { name: "QUERY" data_type: TYPE_STRING - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "DECODER_QUERY" + data_type: TYPE_STRING + dims: [ 1 ] + optional: true + }, + { + name: "IMAGE" + data_type: TYPE_FP16 + dims: [ 3, 224, 224 ] + optional: true }, { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "BAD_WORDS_DICT" @@ -65,14 +77,21 @@ input [ { name: "END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true }, { name: "PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] optional: true + }, + { + name: "PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] + optional: true + allow_ragged_batch: true } ] output [ @@ -86,6 +105,16 @@ output [ data_type: TYPE_INT32 dims: [ 1 ] }, + { + name: "DECODER_INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_DECODER_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, { name: "BAD_WORDS_IDS" data_type: TYPE_INT32 @@ -109,12 +138,17 @@ output [ { name: "OUT_END_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] }, { name: "OUT_PAD_ID" data_type: TYPE_INT32 - dims: [ -1 ] + dims: [ 1 ] + }, + { + name: "OUT_PROMPT_EMBEDDING_TABLE" + data_type: TYPE_FP16 + dims: [ -1, -1 ] } ] @@ -132,6 +166,20 @@ parameters { } } +parameters { + key: "visual_model_path" + value: { + string_value: "${visual_model_path}" + } +} + +parameters: { + key: "gpt_model_path" + value: { + string_value: "${engine_dir}" + } +} + instance_group [ { count: ${preprocessing_instance_count} diff --git a/triton_templates/tensorrt_llm/1/.gitkeep b/triton_templates/tensorrt_llm/1/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 71d2b98..1974161 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -35,6 +35,7 @@ model_transaction_policy { dynamic_batching { preferred_batch_size: [ ${triton_max_batch_size} ] max_queue_delay_microseconds: ${max_queue_delay_microseconds} + default_queue_policy: { max_queue_size: ${max_queue_size} } } input [ @@ -54,6 +55,7 @@ input [ name: "request_output_len" data_type: TYPE_INT32 dims: [ 1 ] + reshape: { shape: [ ] } }, { name: "draft_input_ids" @@ -62,6 +64,20 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "decoder_input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + optional: true + allow_ragged_batch: true + }, + { + name: "decoder_input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + reshape: { shape: [ ] } + }, { name: "draft_logits" data_type: TYPE_FP32 @@ -69,6 +85,13 @@ input [ 
optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +155,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +183,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +204,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -199,12 +257,14 @@ input [ name: "stop" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { name: "streaming" data_type: TYPE_BOOL dims: [ 1 ] + reshape: { shape: [ ] } optional: true }, { @@ -294,6 +354,11 @@ output [ name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] + }, + { + name: "batch_index" + data_type: TYPE_INT32 + dims: [ 1 ] } ] instance_group [ @@ -326,6 +391,12 @@ parameters: { string_value: "${engine_dir}" } } +parameters: { + key: "encoder_model_path" + value: { + string_value: "${encoder_engine_dir}" + } +} parameters: { key: "max_tokens_in_paged_kv_cache" value: { @@ -338,6 +409,12 @@ parameters: { string_value: "${max_attention_window_size}" } } +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} parameters: { key: "batch_scheduler_policy" value: { @@ -351,17 +428,54 @@ parameters: { } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_host_memory_bytes" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_host_memory_bytes}" } } +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" } } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +531,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ -428,3 +542,9 @@ parameters: { string_value: "${medusa_choices}" } } +parameters: { + key: 
"gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git a/triton_templates/tensorrt_llm_bls/1/lib/decode.py b/triton_templates/tensorrt_llm_bls/1/lib/decode.py deleted file mode 100644 index aa2a6d5..0000000 --- a/triton_templates/tensorrt_llm_bls/1/lib/decode.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from collections.abc import Generator -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -class RequestValidationError(Exception): - pass - - -def _validate_that(condition: bool, msg: str): - if not condition: - raise RequestValidationError(msg) - - -def _validate_non_empty(data, msg: str): - _validate_that(data is not None and data.size > 0, msg) - - -def _validate_single_gt_0(data, msg: str): - _validate_non_empty(data, msg) - _validate_that(data.flatten()[0] > 0, msg) - - -def _single_value(data: Optional[np.ndarray]): - if data is None: - return None - return data.flatten()[0] - - -@dataclass -class Request: - text_input: np.ndarray = np.array([]) - max_tokens: np.ndarray = np.array([]) - bad_words: Optional[np.ndarray] = None - stop_words: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - top_k: Optional[np.ndarray] = None - top_p: Optional[np.ndarray] = None - temperature: Optional[np.ndarray] = None - length_penalty: Optional[np.ndarray] = None - repetition_penalty: Optional[np.ndarray] = None - min_length: Optional[np.ndarray] = None - return_log_probs: Optional[np.ndarray] = None - prompt_embedding_table: Optional[np.ndarray] = None - prompt_vocab_size: Optional[np.ndarray] = None - embedding_bias_words: Optional[np.ndarray] = None - embedding_bias_weights: Optional[np.ndarray] = None - num_draft_tokens: Optional[np.ndarray] = None - use_draft_logits: Optional[np.ndarray] = None - stream: Optional[np.ndarray] = None - beam_width: Optional[np.ndarray] = None - return_context_logits: Optional[np.ndarray] = None - return_generation_logits: Optional[np.ndarray] = None - random_seed: Optional[np.ndarray] = None - presence_penalty: Optional[np.ndarray] = None - frequency_penalty: Optional[np.ndarray] = None - - def validate(self): - _validate_non_empty(self.text_input, "text_input is required") - _validate_single_gt_0(self.max_tokens, - "max_tokens must be a single value > 0") - - num_draft_tokens = _single_value(self.num_draft_tokens) - stream = _single_value(self.stream) - gen_logits = _single_value(self.return_generation_logits) - context_logits = _single_value(self.return_context_logits) - - if num_draft_tokens: - _validate_that( - not stream, - "streaming is not supported with speculative decoding") - _validate_that( - not gen_logits, - "generation logits are not supported with speculative decoding" - ) - _validate_that( - not context_logits, - "context logits are not supported with speculative decoding") - - -@dataclass -class DraftRequest: - draft_input_ids: Optional[np.ndarray] = None - draft_logits: Optional[np.ndarray] = None - - -@dataclass -class PreprocResponse: - input_ids: np.ndarray = np.array([]) - input_lengths: np.ndarray = np.array([]) - bad_words_list: Optional[np.ndarray] = None - stop_words_list: Optional[np.ndarray] = None - embedding_bias: Optional[np.ndarray] = None - end_id: Optional[np.ndarray] = None - pad_id: Optional[np.ndarray] = None - - @classmethod - def with_new_inputs(cls, - other, - input_ids: Optional[np.ndarray] = None, - input_lengths: Optional[np.ndarray] = None): - return cls( - input_ids=(input_ids - if input_ids is not None else other.input_ids), - input_lengths=(input_lengths if input_lengths is not None else - other.input_lengths), - bad_words_list=other.bad_words_list, - stop_words_list=other.stop_words_list, - end_id=other.end_id, - pad_id=other.pad_id, - ) - - -@dataclass -class GenerationResponse: - output_ids: np.ndarray = np.array([]) - 
sequence_length: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - -@dataclass -class Response: - text_output: np.ndarray = np.array([]) - cum_log_probs: Optional[np.ndarray] = None - output_log_probs: Optional[np.ndarray] = None - context_logits: Optional[np.ndarray] = None - generation_logits: Optional[np.ndarray] = None - - def __eq__(self, o) -> bool: - """Just for testing""" - if not isinstance(o, Response): - return False - return (np.array_equal(self.text_output, o.text_output) - and np.array_equal(self.cum_log_probs, o.cum_log_probs) - and np.array_equal(self.output_log_probs, o.output_log_probs) - and np.array_equal(self.context_logits, o.context_logits) and - np.array_equal(self.generation_logits, o.generation_logits)) - - -class Decoder: - - def __init__(self, streaming=False, accumulate=False): - self._streaming = streaming - self._accumulate = accumulate - - self._accumulated_tokens = None - - def decode(self, - request: Request, - speculative_decoding=False) -> Generator[Response, None, None]: - preproc_response = self.preprocess(request) - - if speculative_decoding: - for gen_response in self._spec_generate(preproc_response, request): - yield self.postprocess(gen_response) - else: - if not self._streaming: - gen_response = self._generate_non_streaming( - preproc_response, request) - yield self.postprocess(gen_response) - else: - for gen_response in self._generate(preproc_response, request): - yield self.postprocess(gen_response) - - def encountered_stop_words(self, input_ids, stop_words_ids): - for stop_word_ids in stop_words_ids: - if np.array_equal(input_ids[-len(stop_word_ids):], stop_word_ids): - return True - return False - - def _spec_generate( - self, preproc: PreprocResponse, - request: Request) -> Generator[GenerationResponse, None, None]: - - prompt_input_ids: np.ndarray = preproc.input_ids[0] - input_ids: np.ndarray = prompt_input_ids - output_len: int = request.max_tokens[0][0] - last_input_ids: np.ndarray = None - draft_output_ids: np.ndarray = None - draft_logits: np.ndarray = None - - target_response: GenerationResponse = None - - cur_preproc = preproc - - counter = 0 - while True: - counter += 1 - num_draft_tokens = min( - request.num_draft_tokens[0][0], - len(prompt_input_ids) + output_len - len(input_ids) - 1) - - draft_request = None - if num_draft_tokens > 0: - draft_response: GenerationResponse = self._draft_generate_non_streaming( - cur_preproc, request, num_draft_tokens) - seq_len: int = draft_response.sequence_length[0][0] - # [1, beamWidth, outputLength] -> [outputLen] - draft_output_ids = draft_response.output_ids[0][0] - # [1, beamWidth, outputLength, vocabSizePadded] -> [outputLength, vocabSizePadded] - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - if draft_response.generation_logits is not None: - draft_logits = draft_response.generation_logits[0][0] - - input_draft_tokens = draft_output_ids[len(input_ids):seq_len] - draft_request = DraftRequest( - draft_input_ids=np.expand_dims(input_draft_tokens, 0)) - if request.use_draft_logits is not None and request.use_draft_logits[ - 0]: - draft_request.draft_logits = np.expand_dims( - draft_logits[-len(input_draft_tokens):], 0) - else: - draft_request = DraftRequest() - target_response = self._generate_non_streaming( - cur_preproc, request, draft_request) - last_input_ids = input_ids - input_ids = 
target_response.output_ids[0][0] - cur_preproc = PreprocResponse.with_new_inputs( - cur_preproc, np.expand_dims(input_ids, 0), - np.array([[len(input_ids)]], dtype=np.int32)) - - # Evaluate criteria to stop generation loop. - # If we've hit or exceeded the max output length, should stop - length_stop = (len(input_ids) >= - len(prompt_input_ids) + output_len) - if length_stop: - break - # If draft and target have same outputs, should stop. Normally target should return 1 more token. - # If they are the same length, they should differ at the last token - target_draft_equal = draft_output_ids is not None and np.array_equal( - draft_output_ids, input_ids) - if target_draft_equal: - break - # If tokens no longer change, should stop, means we have hit early stopping - last_current_equal = np.array_equal(last_input_ids, input_ids) - if last_current_equal: - break - # Need to check if stop words was encountered - hit_stop_words = self.encountered_stop_words( - input_ids, preproc.stop_words_list[0]) - if hit_stop_words: - break - - yield target_response - - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - raise NotImplementedError() - - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - raise NotImplementedError() - - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - raise NotImplementedError() - - def postprocess(self, gen_response: GenerationResponse) -> Response: - if self._accumulate and self._streaming: - new_tokens: np.ndarray = gen_response.output_ids - if new_tokens.ndim != 3: - raise Exception("Expected output_ids tensor to have 3 dims.") - if new_tokens.shape[0] != 1: - raise Exception("Expected batch size of 1") - if new_tokens.shape[1] != 1: - raise Exception( - "Accumulation of tokens is only implemented for beam width = 1" - ) - - self._accumulated_tokens = new_tokens if ( - self._accumulated_tokens is None) else np.concatenate( - (self._accumulated_tokens, new_tokens), axis=2) - sequence_lengths = np.array([[self._accumulated_tokens.shape[2]]], - dtype=np.int32) - return self._postprocess(self._accumulated_tokens, - sequence_lengths, gen_response) - else: - return self._postprocess(gen_response.output_ids, None, - gen_response) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - raise NotImplementedError() - - def preprocess(self, request: Request) -> PreprocResponse: - raise NotImplementedError() - - def reset_decoder(self): - self._accumulated_tokens = None diff --git a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py deleted file mode 100644 index f0df3b8..0000000 --- a/triton_templates/tensorrt_llm_bls/1/lib/triton_decoder.py +++ /dev/null @@ -1,433 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from collections.abc import Callable -from typing import Dict, Optional - -import numpy as np -import triton_python_backend_utils as pb_utils -from lib.decode import * -from typing_extensions import override - - -class TritonDecoder(Decoder): - - def __init__(self, - streaming=False, - accumulate=False, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name="tensorrt_llm", - draft_llm_model_name: Optional[str] = None): - super().__init__(streaming=streaming, accumulate=accumulate) - self.preproc_model_name = preproc_model_name - self.postproc_model_name = postproc_model_name - self.llm_model_name = llm_model_name - self.draft_llm_model_name = draft_llm_model_name - - self._preproc_outputs = [ - "INPUT_ID", - "REQUEST_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - "EMBEDDING_BIAS", - "OUT_PAD_ID", - "OUT_END_ID", - ] - - self._llm_outputs = [ - "output_ids", - "sequence_length", - "cum_log_probs", - "output_log_probs", - "context_logits", - "generation_logits", - ] - - self._postproc_outputs = [ - "OUTPUT", - ] - - self.input_names = [ - "text_input", - "max_tokens", - "bad_words", - "stop_words", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_embedding_table", - "prompt_vocab_size", - "embedding_bias_words", - "embedding_bias_weights", - "num_draft_tokens", - "use_draft_logits", - ] - - self.__undo_reshape_whitelist = { - "max_tokens", - "end_id", - "pad_id", - "top_k", - "top_p", - "temperature", - "length_penalty", - "repetition_penalty", - "min_length", - "presence_penalty", - "frequency_penalty", - "random_seed", - "return_log_probs", - "return_context_logits", - "return_generation_logits", - "beam_width", - "stream", - "prompt_vocab_size", - "num_draft_tokens", - "use_draft_logits", - } - - def _exec_triton_request(self, request): - responses = request.exec(decoupled=True) - for r in responses: - if r.has_error(): - raise pb_utils.TritonModelException(r.error().message()) - yield r - - def _exec_triton_request_single(self, request): - responses = request.exec(decoupled=False) - if responses.has_error(): - raise 
pb_utils.TritonModelException(responses.error().message()) - return responses - - def create_triton_response(self, response: Response): - name_map = { - "text_output": "text_output", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits" - } - tensors = self.create_triton_tensors(response, name_map) - return pb_utils.InferenceResponse(output_tensors=tensors) - - def convert_triton_request(self, triton_request) -> Request: - request = Request() - for triton_name in self.input_names: - tensor = pb_utils.get_input_tensor_by_name(triton_request, - triton_name) - target_name = triton_name - if tensor is None: - continue - if not hasattr(request, target_name): - raise AttributeError( - f"Request has no attribute '{target_name}'") - setattr(request, target_name, tensor.as_numpy()) - return request - - def convert_triton_response(self, - triton_response, - response_factory: Callable, - name_map=None): - response = response_factory() - for tensor in triton_response.output_tensors(): - if tensor is None: - continue - triton_name = tensor.name() - value = tensor.as_numpy() - target_name = triton_name - if name_map and triton_name in name_map: - target_name = name_map[triton_name] - if name_map and not triton_name in name_map: - continue - if target_name is None: - # explicitly ignore this triton input - continue - if not hasattr(response, target_name): - raise AttributeError( - f"response object has not attribute '{target_name}'") - setattr(response, target_name, value) - return response - - def __undo_reshape(self, x, name): - if name in self.__undo_reshape_whitelist and len(x.shape) == 1: - # handle reshapes - return np.expand_dims(x, 0) - else: - return x - - def create_triton_tensors(self, obj, name_map: dict): - tensors = [] - for name, triton_name in name_map.items(): - if triton_name is None: - continue - value = getattr(obj, name) - if value is None: - continue - t = pb_utils.Tensor(triton_name, self.__undo_reshape(value, name)) - tensors.append(t) - return tensors - - @override - def preprocess(self, request: Request) -> PreprocResponse: - input_tensors = self._get_preproc_tensors(request) - triton_req = pb_utils.InferenceRequest( - model_name=self.preproc_model_name, - inputs=input_tensors, - requested_output_names=self._preproc_outputs) - triton_output = self._exec_triton_request_single(triton_req) - return self._get_preproc_response(triton_output) - - def _get_preproc_tensors(self, request: Request): - name_map = { - "text_input": "QUERY", - "max_tokens": "REQUEST_OUTPUT_LEN", - "bad_words": "BAD_WORDS_DICT", - "stop_words": "STOP_WORDS_DICT", - "embedding_bias_words": "EMBEDDING_BIAS_WORDS", - "embedding_bias_weights": "EMBEDDING_BIAS_WEIGHTS", - "pad_id": "PAD_ID", - "end_id": "END_ID", - } - return self.create_triton_tensors(request, name_map) - - def _get_preproc_response(self, triton_output): - name_map = { - "INPUT_ID": "input_ids", - "REQUEST_INPUT_LEN": "input_lengths", - "BAD_WORDS_IDS": "bad_words_list", - "STOP_WORDS_IDS": "stop_words_list", - "EMBEDDING_BIAS": "embedding_bias", - "OUT_PAD_ID": "pad_id", - "OUT_END_ID": "end_id", - } - return self.convert_triton_response(triton_output, PreprocResponse, - name_map) - - @override - def _draft_generate_non_streaming( - self, preproc: PreprocResponse, request: Request, - num_draft_tokens: int) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, - num_draft_tokens, None, True) - triton_req = 
pb_utils.InferenceRequest( - model_name=self.draft_llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - triton_response = self._exec_triton_request_single(triton_req) - llm_response = self._get_llm_response(triton_response) - return llm_response - - @override - def _generate( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> Generator[GenerationResponse, None, None]: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - for r in self._exec_triton_request(triton_req): - yield self._get_llm_response(r) - - @override - def _generate_non_streaming( - self, - preproc: PreprocResponse, - request: Request, - draft_request: Optional[DraftRequest] = None - ) -> GenerationResponse: - input_tensors = self._get_llm_tensors(preproc, request, None, - draft_request) - triton_req = pb_utils.InferenceRequest( - model_name=self.llm_model_name, - inputs=input_tensors, - requested_output_names=self._llm_outputs) - r = self._exec_triton_request_single(triton_req) - return self._get_llm_response(r) - - def _get_llm_tensors(self, - preproc: PreprocResponse, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - tensors = [] - tensors.extend(self._get_tensors_from_preproc(preproc)) - tensors.extend( - self._get_llm_tensors_from_request(request, num_output_tokens, - draft_request, - is_draft_model_request)) - return tensors - - def _get_tensors_from_preproc(self, preproc: PreprocResponse): - name_map = { - "input_ids": "input_ids", - "input_lengths": "input_lengths", - "bad_words_list": "bad_words_list", - "stop_words_list": "stop_words_list", - "embedding_bias": "embedding_bias", - "pad_id": "pad_id", - "end_id": "end_id", - } - return self.create_triton_tensors(preproc, name_map) - - def _get_llm_tensors_from_request( - self, - request: Request, - num_output_tokens: Optional[int] = None, - draft_request: Optional[DraftRequest] = None, - is_draft_model_request: bool = False): - name_map: Dict[str, Optional[str]] = { - "beam_width": "beam_width", - "top_k": "runtime_top_k", - "top_p": "runtime_top_p", - "length_penalty": "len_penalty", - "repetition_penalty": "repetition_penalty", - "min_length": "min_length", - "presence_penalty": "presence_penalty", - "frequency_penalty": "frequency_penalty", - "random_seed": "random_seed", - "return_log_probs": "return_log_probs", - "stream": "streaming", - "prompt_embedding_table": "prompt_embedding_table", - "prompt_vocab_size": "prompt_vocab_size", - } - tensors = self.create_triton_tensors(request, name_map) - - out_len = request.max_tokens[0][0] if request.max_tokens else None - if num_output_tokens is not None: - out_len = num_output_tokens - elif draft_request: - if draft_request.draft_input_ids is not None: - out_len = len(draft_request.draft_input_ids[0]) + 1 - else: - out_len = 1 - - if out_len is None: - raise Exception("Could not determine request_output_len") - else: - tensors.append( - pb_utils.Tensor("request_output_len", - np.array([[out_len]], dtype=np.int32))) - - if draft_request: - if draft_request.draft_input_ids is not None: - tensors.append( - pb_utils.Tensor("draft_input_ids", - draft_request.draft_input_ids)) - if draft_request.draft_logits is not None and request.use_draft_logits is not None 
and request.use_draft_logits[ - 0]: - tensors.append( - pb_utils.Tensor("draft_logits", - draft_request.draft_logits)) - - return_context_logits = False - return_generation_logits = False - if draft_request is None: - if is_draft_model_request: - return_generation_logits = request.use_draft_logits[ - 0] if request.use_draft_logits is not None else False - else: - return_context_logits = request.return_context_logits[ - 0] if request.return_context_logits is not None else False - return_generation_logits = request.return_generation_logits[ - 0] if request.return_generation_logits is not None else False - - tensors.append( - pb_utils.Tensor("return_context_logits", - np.array([[return_context_logits]]))) - tensors.append( - pb_utils.Tensor("return_generation_logits", - np.array([[return_generation_logits]]))) - return tensors - - def _get_llm_response(self, triton_output): - name_map = { - "output_ids": "output_ids", - "sequence_length": "sequence_length", - "cum_log_probs": "cum_log_probs", - "output_log_probs": "output_log_probs", - "context_logits": "context_logits", - "generation_logits": "generation_logits", - } - return self.convert_triton_response(triton_output, GenerationResponse, - name_map) - - def _postprocess(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse) -> Response: - input_tensors = self._get_postproc_tensors(tokens, sequence_lengths, - gen_response) - triton_req = pb_utils.InferenceRequest( - model_name=self.postproc_model_name, - inputs=input_tensors, - requested_output_names=self._postproc_outputs) - r = self._exec_triton_request_single(triton_req) - response = self._get_response(r, gen_response) - return response - - def _get_postproc_tensors(self, tokens: np.ndarray, - sequence_lengths: Optional[np.ndarray], - gen_response: GenerationResponse): - tensors = [ - pb_utils.Tensor("TOKENS_BATCH", tokens), - pb_utils.Tensor( - "SEQUENCE_LENGTH", sequence_lengths - if sequence_lengths else gen_response.sequence_length) - ] - return tensors - - def _get_response(self, triton_output, gen_res: GenerationResponse): - tensors = triton_output.output_tensors() - t_map = {} - for named_t in tensors: - name = named_t.name() - t = named_t.as_numpy() - t_map[name] = t - response = Response(text_output=t_map["OUTPUT"], - cum_log_probs=gen_res.cum_log_probs, - output_log_probs=gen_res.output_log_probs, - context_logits=gen_res.context_logits, - generation_logits=gen_res.generation_logits) - return response diff --git a/triton_templates/tensorrt_llm_bls/1/model.py b/triton_templates/tensorrt_llm_bls/1/model.py deleted file mode 100644 index 609e323..0000000 --- a/triton_templates/tensorrt_llm_bls/1/model.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import traceback - -import triton_python_backend_utils as pb_utils -from lib.triton_decoder import TritonDecoder - - -class TritonPythonModel: - - def initialize(self, args): - - # Parse model configs - model_config = json.loads(args['model_config']) - - params = model_config['parameters'] - - accumulate_tokens_str = '' - if 'accumulate_tokens' in params: - accumulate_tokens_str = params['accumulate_tokens']['string_value'] - - self.accumulate_tokens = accumulate_tokens_str.lower() in [ - 'true', 'yes', '1', 't' - ] - - self.decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) - - self.logger = pb_utils.Logger - - self.llm_model_name = "tensorrt_llm" - if "tensorrt_llm_model_name" in params: - self.llm_model_name = params["tensorrt_llm_model_name"][ - "string_value"] - self.draft_llm_model_name = None - if "tensorrt_llm_draft_model_name" in params: - self.draft_llm_model_name = params[ - "tensorrt_llm_draft_model_name"]["string_value"] - - self.decoder = TritonDecoder( - streaming=self.decoupled, - accumulate=self.accumulate_tokens, - preproc_model_name="preprocessing", - postproc_model_name="postprocessing", - llm_model_name=self.llm_model_name, - draft_llm_model_name=self.draft_llm_model_name) - - def execute(self, requests): - - responses = [] - - for request in requests: - if self.decoupled: - response_sender = request.get_response_sender() - try: - - req = self.decoder.convert_triton_request(request) - req.validate() - speculative_decode = (req.num_draft_tokens is not None - and req.num_draft_tokens[0][0] > 0) - if speculative_decode and (self.draft_llm_model_name is None - or self.draft_llm_model_name == ""): - raise Exception( - "cannot perform speculative decoding without draft model" - ) - res_gen = self.decoder.decode( - req, speculative_decoding=speculative_decode) - - for res in res_gen: - triton_response = self.decoder.create_triton_response(res) - if self.decoupled: - response_sender.send(triton_response) - else: - responses.append(triton_response) - - if self.decoupled: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - - except Exception: - self.logger.log_error(traceback.format_exc()) - # If encountering an error, send a response with err msg - error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(traceback.format_exc())) - - if self.decoupled: - response_sender.send(error_response) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) - else: - responses.append(error_response) - - self.decoder.reset_decoder() - if self.decoupled: - return None - else: - assert len(responses) == len(requests) - return responses - - def finalize(self): - """`finalize` is called only once when 
the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print('Cleaning up...') diff --git a/triton_templates/tensorrt_llm_bls/config.pbtxt b/triton_templates/tensorrt_llm_bls/config.pbtxt deleted file mode 100644 index e5aff22..0000000 --- a/triton_templates/tensorrt_llm_bls/config.pbtxt +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "tensorrt_llm_bls" -backend: "python" -max_batch_size: ${triton_max_batch_size} - -model_transaction_policy { - decoupled: ${decoupled_mode} -} - -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "end_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "frequency_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_context_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "return_generation_logits" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "prompt_embedding_table" - data_type: TYPE_FP16 - dims: [ -1, -1 ] - optional: true - }, - { - name: "prompt_vocab_size" - data_type: TYPE_INT32 - dims: [ 1 ] - optional: true - }, - { - name: "embedding_bias_words" - data_type: TYPE_STRING - dims: [ -1 ] - optional: true - }, - { - name: "embedding_bias_weights" - data_type: TYPE_FP32 - dims: [ -1 ] - optional: true - }, - { - name: "num_draft_tokens", - data_type: TYPE_INT32, - dims: [ 1 ] - optional: true - }, - { - name: "use_draft_logits", - data_type: TYPE_BOOL, - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "context_logits" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - }, - { - name: "generation_logits" - data_type: TYPE_FP32 - dims: [ -1, -1, -1 ] - } -] - -parameters: { - key: "accumulate_tokens" - value: { - string_value: "${accumulate_tokens}" - } -} -parameters: { - key: "tensorrt_llm_model_name" - value: { - string_value: "${tensorrt_llm_model_name}" - } -} -parameters: { - key: "tensorrt_llm_draft_model_name" - value: { - string_value: "${tensorrt_llm_draft_model_name}" - } -} - -instance_group [ - { - count: ${bls_instance_count} - kind : KIND_CPU - } -]