From 36db136e7ee44f461302c8a175abbb9eab3bc3b5 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 28 Oct 2025 17:23:08 +0000 Subject: [PATCH 01/32] automodel on latest main Signed-off-by: adil-a --- 3rdparty/Automodel-workspace/Automodel | 2 +- pyproject.toml | 8 +- uv.lock | 165 ++++++++++++++++++++----- 3 files changed, 143 insertions(+), 32 deletions(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index a2db048383..b27761ba52 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit a2db048383cd54b3fafc928df4c30bf7bbf7c430 +Subproject commit b27761ba52422d3835428f0c0e05398e58ae2ae2 diff --git a/pyproject.toml b/pyproject.toml index 77efc42a3d..955f12269b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ automodel = [ "flash-attn==2.8.1", "mamba-ssm", "causal-conv1d", + "grouped_gemm @ git+https://github.com/fanshiqing/grouped_gemm@v1.1.4", ] vllm = [ "cuda-python", @@ -188,7 +189,7 @@ explicit = true [tool.uv] preview = true # Enable preview features like extra-build-dependencies -no-build-isolation-package = ["transformer-engine-torch", "transformer-engine", "flash-attn", "mamba-ssm", "causal-conv1d", "deep_gemm", "deep_ep"] +no-build-isolation-package = ["transformer-engine-torch", "transformer-engine", "flash-attn", "mamba-ssm", "deep_gemm", "deep_ep"] # Always apply the build group since dependencies like TE/mcore/nemo-run require build dependencies # and this lets us assume they are implicitly installed with a simply `uv sync`. Ideally, we'd # avoid including these in the default dependency set, but for now it's required. 
@@ -199,7 +200,7 @@ default-groups = ["dev", "build"] # --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container) link-mode = "copy" # This override is needed because automodel/mbridge we are on is still on 2.5.0 -override-dependencies = ["transformer-engine[pytorch]==2.8.0"] +override-dependencies = ["transformer-engine[pytorch]==2.8.0", "opencv-python-headless>=4.11.0"] # Augment build dependencies for packages that need torch at build time [tool.uv.extra-build-dependencies] @@ -210,7 +211,8 @@ deep_gemm = [{ requirement = "torch", match-runtime = true }] transformer-engine = [{ requirement = "torch", match-runtime = true }] transformer-engine-torch = [{ requirement = "torch", match-runtime = true }] mamba-ssm = [{ requirement = "torch", match-runtime = true }] -causal-conv1d = [{ requirement = "torch", match-runtime = true }] +causal-conv1d = ["torch", "setuptools"] +grouped-gemm = ["torch"] # Needed when building from source [[tool.uv.dependency-metadata]] diff --git a/uv.lock b/uv.lock index 846f35c3df..24c2d731d8 100644 --- a/uv.lock +++ b/uv.lock @@ -24,7 +24,10 @@ members = [ "nemo-rl", "penguin", ] -overrides = [{ name = "transformer-engine", extras = ["pytorch"], specifier = "==2.8.0" }] +overrides = [ + { name = "opencv-python-headless", specifier = ">=4.11.0" }, + { name = "transformer-engine", extras = ["pytorch"], specifier = "==2.8.0" }, +] [[manifest.dependency-metadata]] name = "causal-conv1d" @@ -348,22 +351,10 @@ version = "15.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/17/89/940a509ee7e9449f0c877fa984b37b7cc485546035cc67bbc353f2ac20f3/av-15.0.0.tar.gz", hash = "sha256:871c1a9becddf00b60b1294dc0bff9ff193ac31286aeec1a34039bd27e650183", size = 3833128, upload-time = "2025-07-03T16:23:48.455Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/89/81/c5d009ea9c01a513b7af6aac2ac49c0f2f7193345071cd6dd4d91bef3ab9/av-15.0.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:84e2ede9459e64e768f4bc56d9df65da9e94b704ee3eccfe2e5b1da1da754313", size = 21782026, upload-time = "2025-07-03T16:22:18.41Z" }, - { url = "https://files.pythonhosted.org/packages/16/8a/ffe9fcac35a07efc6aa0d765015efa499d88823c01499f318760460f8088/av-15.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:9473ed92d6942c5a449a2c79d49f3425eb0272499d1a3559b32c1181ff736a08", size = 26974939, upload-time = "2025-07-03T16:22:21.493Z" }, - { url = "https://files.pythonhosted.org/packages/a0/e7/0816e52134dc2d0259bb1aaad78573eacaf2bebc1a643de34e3384b520d6/av-15.0.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:56a53fe4e09bebd99355eaa0ce221b681eaf205bdda114f5e17fb79f3c3746ad", size = 34573486, upload-time = "2025-07-03T16:22:24.684Z" }, - { url = "https://files.pythonhosted.org/packages/a3/f4/07cc05712e9824a4bb68beea44eb5a7369dee3f00fa258879190004b7fc5/av-15.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:247dd9a99d7ed3577b8c1e9977e811f423b04504ff36c9dcd7a4de3e6e5fe5ad", size = 38418908, upload-time = "2025-07-03T16:22:27.799Z" }, { url = "https://files.pythonhosted.org/packages/19/48/7f3a21a41e291f8c5b8a98f95cfef308ce1b024a634413ce910c270efd7d/av-15.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:fc50a7d5f60109221ccf44f8fa4c56ce73f22948b7f19b1717fcc58f7fbc383e", size = 40010257, upload-time = "2025-07-03T16:22:31.15Z" }, - { url = "https://files.pythonhosted.org/packages/6d/c9/ced392e82d39084544d2d0c05decb36446028928eddf0d40ec3d8fe6c050/av-15.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:77deaec8943abfebd4e262924f2f452d6594cf0bc67d8d98aac0462b476e4182", size = 40381801, upload-time = "2025-07-03T16:22:34.254Z" }, - { url = 
"https://files.pythonhosted.org/packages/d2/73/a23ad111200e27f5773e94b0b6f9e2ea492a72ded7f4787a358d9d504a8b/av-15.0.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:601d9b0740e47a17ec96ba2a537ebfd4d6edc859ae6f298475c06caa51f0a019", size = 37219417, upload-time = "2025-07-03T16:22:37.497Z" }, { url = "https://files.pythonhosted.org/packages/45/0c/2ac20143b74e3792ede40bfd397ce72fa4e76a03999c2fd0aee3997b6971/av-15.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e021f67e0db7256c9f5d3d6a2a4237a4a4a804b131b33e7f2778981070519b20", size = 41242077, upload-time = "2025-07-03T16:22:40.86Z" }, { url = "https://files.pythonhosted.org/packages/bd/30/40452705dffbfef0f5505d36218970dfeff0a86048689910219c8717b310/av-15.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:383f1b57520d790069d85fc75f43cfa32fca07f5fb3fb842be37bd596638602c", size = 31357617, upload-time = "2025-07-03T16:22:43.934Z" }, - { url = "https://files.pythonhosted.org/packages/a6/27/c2e248498ce78dd504b0b1818ce88e71e30a7e26c348bdf5d6467d7b06f7/av-15.0.0-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:0701c116f32bd9478023f610722f6371d15ca0c068ff228d355f54a7cf23d9cb", size = 21746400, upload-time = "2025-07-03T16:22:46.604Z" }, - { url = "https://files.pythonhosted.org/packages/1d/d8/11f8452f19f4ddc189e978b215420131db40e3919135c14a0d13520f7c94/av-15.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:57fb6232494ec575b8e78e5a9ef9b811d78f8d67324476ec8430ca3146751124", size = 26939576, upload-time = "2025-07-03T16:22:49.255Z" }, - { url = "https://files.pythonhosted.org/packages/00/1c/b109fd41487d91b8843f9e199b65e89ca533a612ec788b11ed0ba9812ea3/av-15.0.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:801a3e0afd5c36df70d012d083bfca67ab22d0ebd2c860c0d9432ac875bc0ad6", size = 34284344, upload-time = "2025-07-03T16:22:52.373Z" }, - { url = 
"https://files.pythonhosted.org/packages/99/71/aee35fa182d0a41227fbd3f4250fd94c54acdd2995025ee59dd948bba930/av-15.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:d5e97791b96741b344bf6dbea4fb14481c117b1f7fe8113721e8d80e26cbb388", size = 38130346, upload-time = "2025-07-03T16:22:56.755Z" }, { url = "https://files.pythonhosted.org/packages/b7/c4/2d9bbc9c42a804c99bc571eeacb2fe1582fe9cfdb726616876cada937d6a/av-15.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:acb4e4aa6bb394d3a9e60feb4cb7a856fc7bac01f3c99019b1d0f11c898c682c", size = 39728857, upload-time = "2025-07-03T16:23:00.392Z" }, - { url = "https://files.pythonhosted.org/packages/7c/d6/a5746e9fb4fdf326e9897abd7538413210e66f35ad4793fe30f87859249d/av-15.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:02d2d80bdbe184f1f3f49b3f5eae7f0ff7cba0a62ab3b18be0505715e586ad29", size = 40109012, upload-time = "2025-07-03T16:23:04.1Z" }, - { url = "https://files.pythonhosted.org/packages/77/1f/da89798231ad0feacfaaea4efec4f1779060226986f97498eabe2c7c54a8/av-15.0.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:603f3ae751f6678df5d8b949f92c6f8257064bba8b3e8db606a24c29d31b4e25", size = 36929211, upload-time = "2025-07-03T16:23:07.694Z" }, { url = "https://files.pythonhosted.org/packages/d5/4c/2bcabe65a1c19e552f03540f16155a0d02cb9b7a90d31242ab3e0c7ea0d8/av-15.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:682686a9ea2745e63c8878641ec26b1787b9210533f3e945a6e07e24ab788c2e", size = 40967172, upload-time = "2025-07-03T16:23:13.488Z" }, { url = "https://files.pythonhosted.org/packages/c9/f0/fe14adaa670ab7a3f709805a8494fd0a2eeb6a5b18b8c59dc6014639a5b1/av-15.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:5758231163b5486dfbf664036be010b7f5ebb24564aaeb62577464be5ea996e0", size = 31332650, upload-time = "2025-07-03T16:23:16.558Z" }, ] @@ -1086,7 +1077,7 @@ name = "decord" version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", 
marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" }, @@ -1139,6 +1130,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/ae/afb1487556e2dc827a17097aac8158a25b433a345386f0e249f6d2694ccb/devtools-0.12.2-py3-none-any.whl", hash = "sha256:c366e3de1df4cdd635f1ad8cbcd3af01a384d7abda71900e68d43b04eb6aaca7", size = 19411, upload-time = "2023-09-03T16:56:59.049Z" }, ] +[[package]] +name = "diffusers" +version = "0.35.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "importlib-metadata" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/68/288ca23c7c05c73e87ffe5efffc282400ac9b017f7a9bb03883f4310ea15/diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded", size = 3366711, upload-time = "2025-10-15T04:05:17.213Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2e/38d9824f8c6bb048c5ba21c6d4da54c29c162a46b58b3ef907a360a76d3e/diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5", size = 4121649, upload-time = "2025-10-15T04:05:14.391Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -1535,6 +1545,18 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + [[package]] name = "gguf" version = "0.17.1" @@ -1798,6 +1820,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/27/3d6dcadc8a3214d8522c1e7f6a19554e33659be44546d44a2f7572ac7d2a/groovy-0.1.2-py3-none-any.whl", hash = "sha256:7f7975bab18c729a257a8b1ae9dcd70b7cafb1720481beae47719af57c35fa64", size = 14090, upload-time = "2025-02-28T20:24:55.152Z" }, ] +[[package]] +name = "grouped-gemm" +version = "1.1.4" +source = { git = "https://github.com/fanshiqing/grouped_gemm?rev=v1.1.4#172fada89fa7364fe5d026b3a0dfab58b591ffdd" } +dependencies = [ + { name = "absl-py" }, + { name = "numpy" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, +] + [[package]] name = "grpcio" version = "1.74.0" @@ -1990,6 +2023,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "imageio-ffmpeg" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/44/bd/c3343c721f2a1b0c9fc71c1aebf1966a3b7f08c2eea8ed5437a2865611d6/imageio_ffmpeg-0.6.0.tar.gz", hash = "sha256:e2556bed8e005564a9f925bb7afa4002d82770d6b08825078b7697ab88ba1755", size = 25210, upload-time = "2025-01-16T21:34:32.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/58/87ef68ac83f4c7690961bce288fd8e382bc5f1513860fc7f90a9c1c1c6bf/imageio_ffmpeg-0.6.0-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = "sha256:9d2baaf867088508d4a3458e61eeb30e945c4ad8016025545f66c4b5aaef0a61", size = 24932969, upload-time = "2025-01-16T21:34:20.464Z" }, + { url = "https://files.pythonhosted.org/packages/40/5c/f3d8a657d362cc93b81aab8feda487317da5b5d31c0e1fdfd5e986e55d17/imageio_ffmpeg-0.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b1ae3173414b5fc5f538a726c4e48ea97edc0d2cdc11f103afee655c463fa742", size = 21113891, upload-time = "2025-01-16T21:34:00.277Z" }, + { url = "https://files.pythonhosted.org/packages/33/e7/1925bfbc563c39c1d2e82501d8372734a5c725e53ac3b31b4c2d081e895b/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1d47bebd83d2c5fc770720d211855f208af8a596c82d17730aa51e815cdee6dc", size = 25632706, upload-time = "2025-01-16T21:33:53.475Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2d/43c8522a2038e9d0e7dbdf3a61195ecc31ca576fb1527a528c877e87d973/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c7e46fcec401dd990405049d2e2f475e2b397779df2519b544b8aab515195282", size = 29498237, upload-time = "2025-01-16T21:34:13.726Z" }, + { url = "https://files.pythonhosted.org/packages/a0/13/59da54728351883c3c1d9fca1710ab8eee82c7beba585df8f25ca925f08f/imageio_ffmpeg-0.6.0-py3-none-win32.whl", hash = "sha256:196faa79366b4a82f95c0f4053191d2013f4714a715780f0ad2a68ff37483cc2", size = 19652251, upload-time = "2025-01-16T21:34:06.812Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/c6/fa760e12a2483469e2bf5058c5faff664acf66cadb4df2ad6205b016a73d/imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02fa47c83703c37df6bfe4896aab339013f62bf02c5ebf2dce6da56af04ffc0a", size = 31246824, upload-time = "2025-01-16T21:34:28.6Z" }, +] + [[package]] name = "imagesize" version = "1.4.1" @@ -3023,8 +3070,12 @@ source = { editable = "3rdparty/Automodel-workspace/Automodel" } dependencies = [ { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "datasets" }, + { name = "diffusers" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "megatron-fsdp" }, + { name = "opencv-python-headless" }, { name = "pybind11" }, { name = "pyyaml" }, { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, @@ -3036,6 +3087,40 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "backoff" }, + { name = "flash-attn" }, + { name = "mistral-common", extra = ["opencv"] }, + { name = "numba" }, + { name = "numpy" }, + { name = "nvidia-nvshmem-cu13" }, + { name = "pillow" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "timm" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and 
sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "transformers" }, +] +deepep = [ + { name = "nvidia-nvshmem-cu13" }, +] +dev = [ + { name = "backoff" }, + { name = "flash-attn" }, + { name = "mistral-common", extra = ["opencv"] }, + { name = "numba" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "timm" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "transformers" }, +] fa = [ { name = "flash-attn" }, ] @@ -3048,9 +3133,12 @@ vlm = [ { name = "numba" }, { name = "numpy" }, { name = "pillow" }, - { name = "qwen-vl-utils", extra = ["decord"] }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "timm" }, - { name = "torchcodec" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, + { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torchvision", version = "0.23.0+cu129", 
source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, { name = "transformers" }, ] @@ -3087,28 +3175,40 @@ requires-dist = [ { name = "backoff", marker = "extra == 'vlm'" }, { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" }, { name = "datasets", specifier = ">=4.0.0" }, + { name = "diffusers" }, { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, { name = "megatron-fsdp" }, { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, + { name = "nemo-automodel", extras = ["deepep"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["fa"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["fa"], marker = "extra == 'dev'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["vlm"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["vlm"], marker = "extra == 'dev'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "numba", marker = "extra == 'vlm'" }, { name = "numpy", marker = "extra == 'vlm'" }, + { name = "nvidia-nvshmem-cu13", marker = "extra == 'deepep'" }, + { name = "opencv-python-headless", specifier = "==4.10.0.84" }, { name = "pillow", marker = "extra == 'vlm'" }, { name = "pybind11" }, { name = "pyyaml" }, - { name = "qwen-vl-utils", extras = ["decord"], marker = "extra == 'vlm'" }, + { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and 
sys_platform != 'darwin' and extra == 'vlm'" }, { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" }, { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.8.0", index = "https://download.pytorch.org/whl/cu129" }, { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.8.0", index = "https://pypi.org/simple" }, { name = "torchao" }, - { name = "torchcodec", marker = "extra == 'vlm'" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, { name = "torchdata" }, + { name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'vlm'", index = "https://pypi.org/simple" }, + { name = "torchvision", marker = "sys_platform != 'darwin' and extra == 'vlm'", index = "https://download.pytorch.org/whl/cu129" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "<=4.55.4" }, - { name = "transformers", marker = "extra == 'vlm'", specifier = "<=4.55.4" }, + { name = "transformers", specifier = "<=4.57.1" }, + { name = "transformers", marker = "extra == 'vlm'", specifier = "<=4.57.1" }, { name = "wandb" }, ] -provides-extras = ["vlm", "fa", "moe"] +provides-extras = ["vlm", "fa", "moe", "deepep", "dev", "all"] [package.metadata.requires-dev] build = [ @@ -3182,6 +3282,7 @@ dependencies = [ automodel = [ { name = "causal-conv1d" }, { name = "flash-attn" }, + { name = "grouped-gemm" }, { name = "mamba-ssm" }, { name = "nemo-automodel" }, { name = "vllm" }, @@ -3257,6 +3358,7 @@ requires-dist = [ { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.8.1" }, { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.8.1" }, + { name = "grouped-gemm", marker = "extra == 'automodel'", git = "https://github.com/fanshiqing/grouped_gemm?rev=v1.1.4" }, { name = "hydra-core" 
}, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, @@ -3660,6 +3762,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/49/7e1e3e98f5b8ae79f21260f9a90d8d985e5ad67b69b90b09456fc3c01a18/nvidia_nvshmem_cu12-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0032831c0ec4fdc64c3bd8daeae588f6647ee4afc3376c5871218546acac0e81", size = 139158697, upload-time = "2025-08-22T19:56:39.552Z" }, ] +[[package]] +name = "nvidia-nvshmem-cu13" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dc/0f/05cc9c720236dcd2db9c1ab97fff629e96821be2e63103569da0c9b72f19/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dc2a197f38e5d0376ad52cd1a2a3617d3cdc150fd5966f4aee9bcebb1d68fe9", size = 60215947, upload-time = "2025-09-06T00:32:20.022Z" }, + { url = "https://files.pythonhosted.org/packages/3c/35/a9bf80a609e74e3b000fef598933235c908fcefcef9026042b8e6dfde2a9/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:290f0a2ee94c9f3687a02502f3b9299a9f9fe826e6d0287ee18482e78d495b80", size = 60412546, upload-time = "2025-09-06T00:32:41.564Z" }, +] + [[package]] name = "nvidia-nvtx-cu12" version = "12.9.79" @@ -5135,10 +5246,10 @@ name = "qwen-vl-utils" version = "0.0.11" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "av" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "requests" }, + { name = "av", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "packaging", marker = "(platform_machine != 'aarch64' and sys_platform != 
'darwin') or sys_platform == 'win32'" }, + { name = "pillow", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, + { name = "requests", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/42/9f/1229a40ebd49f689a0252144126f3865f31bb4151e942cf781a2936f0c4d/qwen_vl_utils-0.0.11.tar.gz", hash = "sha256:083ba1e5cfa5002165b1e3bddd4d6d26d1d6d34473884033ef12ae3fe8496cd5", size = 7924, upload-time = "2025-04-21T10:38:47.461Z" } wheels = [ @@ -5147,7 +5258,7 @@ wheels = [ [package.optional-dependencies] decord = [ - { name = "decord" }, + { name = "decord", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] [[package]] @@ -6408,9 +6519,7 @@ name = "torchcodec" version = "0.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/b3/11326a0e7a3c803a95975cfce4ac88fa4ea1a0d432bb876081046c5a5554/torchcodec-0.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fba260145a239b5afe13336e3a5bc1b089c9c31a073e9a7c2026d4cbd853fdd9", size = 3482584, upload-time = "2025-08-07T08:51:32.535Z" }, { url = "https://files.pythonhosted.org/packages/a7/d1/3f90561df013f6a015ef19de22726b64073fee405f53d3c4b8255ab05a67/torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:fdef91a17fb1f1a159ce23710324a9a4e6d6a885275de73700f94a9ad562c6b2", size = 1370954, upload-time = "2025-08-07T08:51:15.021Z" }, - { url = "https://files.pythonhosted.org/packages/87/d0/0b5dd42652e4527d578e1d6239dbb907bf83e502115e517b83a55d8b7f8b/torchcodec-0.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:de20cab5df7fa7cdd74ec1dc0d508324685573f86de6789f0ebb860b7ea20b33", size = 3446017, upload-time = "2025-08-07T08:51:34.484Z" }, { url = 
"https://files.pythonhosted.org/packages/97/62/a938334e39101d4304619b90847d8aef7d1c607c6bcf33638f72931ae990/torchcodec-0.6.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:46dab701a2d809e975a8b07d7ee47ed34f1d903511e374c74cfc1de6a5ab0e3f", size = 1374794, upload-time = "2025-08-07T08:51:17.355Z" }, ] From 8e562e80af211d50155407a2c62310af810d9c8a Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 01:39:26 +0000 Subject: [PATCH 02/32] new automodel checkpointing Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 197 ++++++++++++++++-- nemo_rl/models/policy/utils.py | 2 +- nemo_rl/utils/automodel_checkpoint.py | 6 +- nemo_rl/utils/checkpoint.py | 4 + 4 files changed, 184 insertions(+), 25 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 5745534363..1301509fa2 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -27,7 +27,7 @@ from nemo_automodel import ( NeMoAutoModelForSequenceClassification, ) -from nemo_automodel.components._transformers.utils import ( +from nemo_automodel._transformers.utils import ( sliding_window_overwrite, ) from nemo_automodel.components.distributed.cp_utils import ( @@ -88,11 +88,18 @@ import_class_from_path, resolve_model_class, ) -from nemo_rl.utils.automodel_checkpoint import ( - load_checkpoint, - save_checkpoint, -) +# from nemo_rl.utils.automodel_checkpoint import ( +# load_checkpoint as _unused_wrapper_load_checkpoint, # backward compat: keep import to avoid breaking external users +# save_checkpoint as _unused_wrapper_save_checkpoint, # not used after direct Checkpointer usage +# ) from nemo_rl.utils.checkpoint import CheckpointingConfig +from nemo_automodel.components.checkpoint.checkpointing import ( + Checkpointer, + CheckpointingConfig as AutomodelCheckpointingConfig, +) +from nemo_automodel.components.checkpoint._backports.filesystem 
import ( + SerializationFormat, +) from nemo_rl.utils.nsys import wrap_with_nvtx_name from nemo_rl.utils.packed_tensor import packed_broadcast_producer @@ -461,6 +468,11 @@ def __init__( "No weights path provided. Starting from scratch (default policy init)" ) + # We initialize the AutoModel checkpointer here. This needs to be persistent because of async checkpointing support + # once NeMo-RL is >= torch 2.9.0 + self.checkpointer = None + self.checkpoint_config = None + def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor: if "generation" in self.cfg and self.cfg["generation"] is not None: logits.div_(self.cfg["generation"]["temperature"]) @@ -1884,41 +1896,137 @@ def save_checkpoint( "save_consolidated", "is_peft", "peft_config", + "model_cache_dir", + "model_repo_id", + "is_async", + "dequantize_base_checkpoint", } } - save_checkpoint( + checkpoint_root = _infer_checkpoint_root(weights_path) + + # Ensure a persistent Checkpointer exists and is configured + self._ensure_checkpointer(config_updates=checkpoint_kwargs, checkpoint_root=checkpoint_root) + + self.checkpointer.save_model( model=self.model, weights_path=weights_path, - optimizer=self.optimizer if optimizer_path else None, - scheduler=self.scheduler if optimizer_path else None, - optimizer_path=optimizer_path, - tokenizer=self.tokenizer if tokenizer_path else None, - tokenizer_path=tokenizer_path, - model_state_dict_keys=self.model_state_dict_keys, - **checkpoint_kwargs, + peft_config=checkpoint_kwargs.get("peft_config"), + tokenizer=self.tokenizer if tokenizer_path is None else None, ) + if optimizer_path and self.optimizer is not None: + self.checkpointer.save_optimizer( + optimizer=self.optimizer, + model=self.model, + weights_path=optimizer_path, + scheduler=self.scheduler, + ) + + # TODO: needed? 
+ if tokenizer_path and self.tokenizer is not None: + print(f"Saving tokenizer (or processor) to {tokenizer_path}") + self.tokenizer.save_pretrained(tokenizer_path) + def load_checkpoint( self, weights_path: str, optimizer_path: Optional[str] = None, ) -> None: - """Load a checkpoint into the model.""" - load_checkpoint( + """Load a checkpoint into the model using Automodel Checkpointer.""" + print(f"Loading weights from {weights_path}") + + model_save_format, is_peft = detect_checkpoint_format(weights_path) + + weights_dir = os.path.dirname(weights_path) + checkpoint_root = ( + os.path.dirname(weights_dir) if weights_dir.endswith("weights") else weights_dir + ) + + # Ensure a persistent Checkpointer exists and is configured + self._ensure_checkpointer( + config_updates={ + "model_save_format": model_save_format, + "is_peft": is_peft, + }, + checkpoint_root=checkpoint_root, + ) + + model_dir = weights_path if weights_path.endswith("/model") else os.path.join(weights_path, "model") + + self.checkpointer.load_model( model=self.model, - weights_path=weights_path, - optimizer=self.optimizer if optimizer_path else None, - scheduler=self.scheduler if optimizer_path else None, - optimizer_path=optimizer_path, + model_path=model_dir, ) + if optimizer_path and self.optimizer is not None: + self.checkpointer.load_optimizer( + optimizer=self.optimizer, + model=self.model, + weights_path=optimizer_path, + scheduler=self.scheduler, + ) + + def _ensure_checkpointer(self, config_updates=None, checkpoint_root: Optional[str] = None) -> None: + """Create or update a persistent Automodel Checkpointer bound to this worker ranks. + + Args: + config_updates: Dict of CheckpointingConfig fields to update. + checkpoint_root: Optional root directory for checkpoints. 
+ """ + if config_updates is None: + config_updates = {} + + # Compute dp/tp ranks + dp_rank = torch.distributed.get_rank(self.dp_mesh.get_group()) + tp_rank = torch.distributed.get_rank(self.tp_mesh.get_group()) + pp_rank = 0 + + if self.checkpointer is None: + # Initialize a base config with sensible defaults + base_cfg = AutomodelCheckpointingConfig( + enabled=True, + checkpoint_dir=checkpoint_root or "", + model_save_format=config_updates.get("model_save_format", "safetensors"), + model_cache_dir=config_updates.get("model_cache_dir", ""), + model_repo_id=config_updates.get("model_repo_id", ""), + save_consolidated=config_updates.get("save_consolidated", False), + is_peft=config_updates.get("is_peft", False), + model_state_dict_keys=getattr(self, "model_state_dict_keys", None), + is_async=config_updates.get("is_async", False), + dequantize_base_checkpoint=config_updates.get("dequantize_base_checkpoint", False), + ) + self.checkpoint_config = base_cfg + self.checkpointer = Checkpointer( + config=base_cfg, + dp_rank=dp_rank, + tp_rank=tp_rank, + pp_rank=pp_rank, + moe_mesh=None, + ) + else: + # Update mutable config fields on the existing instance + cfg = self.checkpointer.config + if checkpoint_root is not None: + cfg.checkpoint_dir = checkpoint_root + for k, v in config_updates.items(): + if k == "model_save_format": + # Ensure enum type + v = SerializationFormat[v.upper()] if isinstance(v, str) else v + setattr(cfg, k, v) + # Ensure model_state_dict_keys is current + if getattr(self, "model_state_dict_keys", None) is not None: + cfg.model_state_dict_keys = self.model_state_dict_keys + def shutdown(self) -> None: """Shutdown the policy.""" # Clean up extension resources like ZMQ sockets if hasattr(self, "zmq_socket"): self.zmq_socket.close() self.zmq_context.term() + # Close checkpointer resources + if hasattr(self, "checkpointer") and self.checkpointer is not None: + self.checkpointer.close() def start_gpu_profiling(self) -> None: """Start GPU profiling.""" @@ 
-1933,3 +2041,54 @@ def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]: ip = ray._private.services.get_node_ip_address() gpu_id = ray.get_gpu_ids()[0] return (ip, gpu_id) + +def detect_checkpoint_format(weights_path: str) -> tuple[str, bool]: + """Detect model save format and PEFT status from checkpoint directory. + + Args: + weights_path: Path to the checkpoint directory (e.g., weights/model) + + Returns: + tuple: (model_save_format, is_peft) where: + model_save_format is "torch_save" for DCP or "safetensors" for safetensors + is_peft is True if PEFT/adapter patterns are detected + """ + is_peft = False + model_save_format = "safetensors" + try: + # Iterate through all subdirectories and files recursively + all_files = [] + for root, dirs, files in os.walk(weights_path): + all_files.extend(files) + + if any(f.endswith(".distcp") for f in all_files): + model_save_format = "torch_save" + elif any(f.endswith(".safetensors") for f in all_files): + model_save_format = "safetensors" + elif any(f.endswith((".bin", ".pt", ".pth")) for f in all_files): + model_save_format = "torch_save" + + if not is_peft: + is_peft = any("adapter" in f.lower() for f in all_files) + + except (OSError, PermissionError): + pass + + return model_save_format, is_peft + +def _infer_checkpoint_root(weights_path: str) -> str: + """Infer checkpoint root directory from weights path. + + When weights_path ends with "…/weights/model", we need the parent of + the weights directory (the checkpoint root), not the weights directory itself. 
+ + Args: + weights_path: Path to model weights (e.g., "/path/to/policy/weights/model") + + Returns: + str: Checkpoint root directory (e.g., "/path/to/policy") + """ + weights_dir = os.path.dirname(weights_path) + if weights_dir.endswith("weights"): + return os.path.dirname(weights_dir) + return weights_dir \ No newline at end of file diff --git a/nemo_rl/models/policy/utils.py b/nemo_rl/models/policy/utils.py index c9c4c5a1bb..e435e97a87 100644 --- a/nemo_rl/models/policy/utils.py +++ b/nemo_rl/models/policy/utils.py @@ -29,7 +29,7 @@ # Try to import nemo_automodel classes, fallback to None if not available try: - from nemo_automodel.components._transformers.auto_model import ( + from nemo_automodel._transformers.auto_model import ( NeMoAutoModelForCausalLM, NeMoAutoModelForImageTextToText, NeMoAutoModelForTextToWaveform, diff --git a/nemo_rl/utils/automodel_checkpoint.py b/nemo_rl/utils/automodel_checkpoint.py index a9f0793851..5a0c6376ec 100644 --- a/nemo_rl/utils/automodel_checkpoint.py +++ b/nemo_rl/utils/automodel_checkpoint.py @@ -25,13 +25,9 @@ # Apply torch backports for compatibility with torch==2.7.1 from nemo_automodel.components.checkpoint._torch_backports import apply_patches -# Import from nemo-automodel from nemo_automodel.components.checkpoint.checkpointing import ( CheckpointingConfig, - load_model, - load_optimizer, - save_model, - save_optimizer, + Checkpointer, ) # Apply torch backports for compatibility with torch==2.7.1 diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index 05e0ee2f3a..f3860a3d7e 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -65,6 +65,8 @@ class CheckpointingConfig(TypedDict): model_repo_id: NotRequired[str] # Default: "" is_peft: NotRequired[bool] # Default: False peft_config: NotRequired[Any] # Default: None + is_async: NotRequired[bool] # Default: False + dequantize_base_checkpoint: NotRequired[bool] # Default: False class CheckpointManager: @@ -105,6 +107,8 @@ def 
__init__(self, config: CheckpointingConfig): self.model_cache_dir = config.get("model_cache_dir", "") self.model_repo_id = config.get("model_repo_id", "") self.is_peft = config.get("is_peft", False) + self.is_async = config.get("is_async", False) + self.dequantize_base_checkpoint = config.get("dequantize_base_checkpoint", False) def init_tmp_checkpoint( self, From 5a1cff16a33515de597f5d040d42d9ca8a739b4d Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 06:52:02 +0000 Subject: [PATCH 03/32] adding automodel sharding Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 186 ++++++++---------- 1 file changed, 77 insertions(+), 109 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 1301509fa2..f610b7aae4 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -41,6 +41,9 @@ from nemo_automodel.components.distributed.parallelizer import ( fsdp2_strategy_parallelize, ) +from nemo_automodel.components.distributed.fsdp2 import ( + FSDP2Manager, +) from nemo_automodel.components.distributed.tensor_utils import ( get_cpu_state_dict, to_local_if_dtensor, @@ -103,6 +106,8 @@ from nemo_rl.utils.nsys import wrap_with_nvtx_name from nemo_rl.utils.packed_tensor import packed_broadcast_producer +from transformers.utils import TRANSFORMERS_CACHE + @ray.remote( runtime_env=get_runtime_env_for_policy_worker("dtensor_policy_worker_v2") @@ -156,6 +161,11 @@ def __init__( world_size = torch.distributed.get_world_size() model_name = self.cfg["model_name"] + # We initialize the AutoModel checkpointer here. 
This needs to be persistent because of async checkpointing support + # once NeMo-RL is >= torch 2.9.0 + self.checkpointer = None + self.checkpoint_config = None + self.cpu_offload = self.cfg["dtensor_cfg"]["cpu_offload"] self.max_grad_norm = self.cfg["max_grad_norm"] @@ -181,6 +191,14 @@ def __init__( hf_config_overrides = self.cfg.get("hf_config_overrides", {}) or {} + # Choose attention implementation consistent with train_ft.py logic + # - Packed sequence requires FA2 and CP must be 1 + # - CP > 1 requires SDPA + cp_size_cfg = self.cfg["dtensor_cfg"]["context_parallel_size"] + attn_impl = ( + "flash_attention_2" if (self.enable_seq_packing and cp_size_cfg == 1) else ("sdpa" if cp_size_cfg > 1 else None) + ) + model_config = AutoConfig.from_pretrained( model_name, # Always load the model in float32 to keep master weights in float32. @@ -190,9 +208,7 @@ def __init__( **sliding_window_overwrite( model_name ), # due to https://github.com/huggingface/transformers/issues/38002 - attn_implementation="flash_attention_2" - if self.enable_seq_packing - else None, + attn_implementation=attn_impl, **hf_config_overrides, ) @@ -233,24 +249,6 @@ def __init__( # DO NOT assume AutoModelForCausalLM, multimodal models can inherit from AutoModelForImageTextToText, AutoModelForTextToWaveform, etc. 
model_class = resolve_model_class(model_config.model_type) - full_state_dict = None - model_state_dict_keys = None - if self.rank == 0: - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") - model = model_class.from_pretrained( - model_name, - device_map="cpu", # load weights onto CPU initially - trust_remote_code=True, - config=model_config, - use_liger_kernel=False, - torch_dtype=str(model_config.torch_dtype), - ) - - full_state_dict = model.state_dict() - # Store the original model state dict keys before any parallelization - model_state_dict_keys = list(full_state_dict.keys()) - del model - print(f"[Rank {self.rank}] Initializing empty model for FSDP...") # All ranks initialize model on meta device, so FSDP can shard it. # The actual weights will be broadcast from rank 0. @@ -261,28 +259,26 @@ def __init__( # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180 self.model = model_class.from_config( model_config, - attn_implementation="flash_attention_2" - if self.enable_seq_packing - else None, + attn_implementation=attn_impl, use_liger_kernel=False, trust_remote_code=True, torch_dtype=str(model_config.torch_dtype), ) + # Hold a copy of model state_dict keys before any parallelization (as in train_ft.py) + self.model_state_dict_keys = list(self.model.state_dict().keys()) + if self.model.config.pad_token_id is None: self.model.config.pad_token_id = tokenizer.pad_token_id - tp_size = self.cfg["dtensor_cfg"]["tensor_parallel_size"] - cp_size = self.cfg["dtensor_cfg"]["context_parallel_size"] + tp_size = self.cfg["dtensor_cfg"].get("tensor_parallel_size", 1) + cp_size = self.cfg["dtensor_cfg"].get("context_parallel_size", 1) + ep_size = self.cfg["dtensor_cfg"].get("expert_parallel_size", 1) if cp_size > 1 and self.enable_seq_packing: raise ValueError( "Context parallel is not supported for sequence packing. 
Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." ) - dp_size = world_size // tp_size // cp_size sequence_parallel_enabled = self.cfg["dtensor_cfg"]["sequence_parallel"] - assert world_size == dp_size * tp_size * cp_size, ( - f"World size({world_size}) must equal to dp_size({dp_size}) * tp_size({tp_size}) * cp_size({cp_size}) to use DTensor" - ) if sequence_parallel_enabled and tp_size == 1: print( @@ -309,87 +305,67 @@ def __init__( "Context parallel is yet not supported for VLM models. Please set cp_size = 1 to train VLM models." ) - # For FSDP2 compatibility, we need to support HSDP structure - # For now, we use dp_replicate_size = 1 (no hybrid sharding) - dp_replicate_size = 1 - dp_shard_size = dp_size - - # torch==2.8 uses LOCAL_RANK to set the device here (https://github.com/pytorch/pytorch/blob/ba56102387ef21a3b04b357e5b183d48f0afefc7/torch/distributed/device_mesh.py#L500), - # but CUDA_VISIBLE_DEVICES is set to only 1 gpu, so we need to temporarily set LOCAL_RANK to 0. 
- # TODO: consider changing the default LOCAL_RANK set in worker_groups.py - prev_local_rank = os.environ["LOCAL_RANK"] - os.environ["LOCAL_RANK"] = "0" - - # Create device mesh with HSDP structure for FSDP2 compatibility - device_mesh = torch.distributed.device_mesh.init_device_mesh( - "cuda", - (dp_replicate_size, dp_shard_size, cp_size, tp_size), - mesh_dim_names=("dp_replicate", "dp_shard", "cp", "tp"), - ) - os.environ["LOCAL_RANK"] = prev_local_rank - - # Create flattened submeshes for different use cases - # Flatten dp_replicate + dp_shard for the "dp" dimension (backward compatibility) - device_mesh[("dp_replicate", "dp_shard")]._flatten(mesh_dim_name="dp") - - # Flatten dp_shard + cp for FSDP2 sharding - device_mesh[("dp_shard", "cp")]._flatten(mesh_dim_name="dp_shard_cp") - - # Flatten dp_replicate + dp_shard + cp for gradient operations - device_mesh[("dp_replicate", "dp_shard", "cp")]._flatten(mesh_dim_name="dp_cp") - - # Store mesh references for backward compatibility - self.dp_cp_mesh = device_mesh["dp_cp"] - self.dp_mesh = device_mesh["dp"] - self.tp_mesh = device_mesh["tp"] - self.cp_mesh = device_mesh["cp"] - - self.dp_size = dp_size - self.tp_size = tp_size - self.cp_size = cp_size - self.device_mesh = device_mesh - # ------------------------------------------------ - # 3) Move to GPU + Composable FSDP - # (Initialize device mesh, shard submodules, then shard entire model) + # Build device mesh and parallelize # ------------------------------------------------ - self.model = fsdp2_strategy_parallelize( - self.model, - device_mesh=self.device_mesh, + manager = FSDP2Manager( + dp_size=None, + dp_replicate_size=1, + tp_size=tp_size, + cp_size=cp_size, + ep_size=ep_size, + pp_size=1, + sequence_parallel=sequence_parallel_enabled, + use_hf_tp_plan=self.cfg["dtensor_cfg"].get("use_hf_tp_plan", False), mp_policy=MixedPrecisionPolicy( param_dtype=self.dtype, reduce_dtype=torch.float32, output_dtype=torch.float32, ), - 
offload_policy=CPUOffloadPolicy(pin_memory=False) - if self.cpu_offload - else OffloadPolicy(), - sequence_parallel=sequence_parallel_enabled, - activation_checkpointing=self.cfg["dtensor_cfg"][ - "activation_checkpointing" - ], - tp_shard_plan=self.cfg["dtensor_cfg"]["custom_parallel_plan"], - dp_replicate_mesh_name="dp_replicate", - dp_shard_cp_mesh_name="dp_shard_cp", - tp_mesh_name="tp", + offload_policy=CPUOffloadPolicy(pin_memory=False) if self.cpu_offload else None, + backend="nccl", + world_size=world_size, + activation_checkpointing=self.cfg["dtensor_cfg"]["activation_checkpointing"], ) - print(f"[Rank {self.rank}] Loading state dict from rank 0...") - # This will broadcast the state dict from rank 0 to all other ranks - # and load it into the FSDP model. - set_model_state_dict( - self.model, - model_state_dict=full_state_dict, - options=StateDictOptions( - full_state_dict=True, - broadcast_from_rank0=True, - ), + # Store mesh references for downstream usage + self.device_mesh = manager.device_mesh + self.dp_cp_mesh = self.device_mesh["dp_cp"] + self.dp_mesh = self.device_mesh["dp"] + self.tp_mesh = self.device_mesh["tp"] + self.cp_mesh = self.device_mesh["cp"] + self.moe_mesh = getattr(manager, "moe_mesh", None) + + self.dp_size = manager.dp_size + self.tp_size = manager.tp_size + self.cp_size = manager.cp_size + + # Parallelize model (FSDP2 + TP plan) + self.model = manager.parallelize(self.model) + + # Load base model weights across all ranks using Automodel Checkpointer + # This mirrors build_model_and_optimizer's is_meta_device + load_weights path + self._ensure_checkpointer( + config_updates={ + "model_repo_id": model_name, + "model_cache_dir": hf_config_overrides.get("cache_dir", ""), + "save_consolidated": False, + "is_peft": False, + }, + checkpoint_root=None, ) + self.checkpointer.config.model_state_dict_keys = self.model_state_dict_keys - # Broadcast model state dict keys to all ranks and store as instance variable - keys_to_broadcast = 
[model_state_dict_keys] - torch.distributed.broadcast_object_list(keys_to_broadcast, src=0) - self.model_state_dict_keys = keys_to_broadcast[0] + # Load base HF weights unless an explicit checkpoint is provided later + # This puts shards directly into the parallelized model + self.checkpointer.load_base_model( + self.model, + device=torch.cuda.current_device(), + root_dir=hf_config_overrides.get("cache_dir", TRANSFORMERS_CACHE), + model_name=model_name, + peft_init_method=None, # TODO: change for LoRA + load_base_model=True, + ) # Handle tied word embeddings after loading the state dict # We need to actually tie the parameters at the model level @@ -406,10 +382,6 @@ def __init__( if embed_tokens_weight is not None: self.model.lm_head.weight = embed_tokens_weight - # Manually broadcast buffers - for _, buf in self.model.named_buffers(): - torch.distributed.broadcast(to_local_if_dtensor(buf), src=0) - if self.cpu_offload: self.model = self.move_to_device(self.model, "cpu") @@ -465,13 +437,9 @@ def __init__( self.load_checkpoint(weights_path, optimizer_path) else: print( - "No weights path provided. Starting from scratch (default policy init)" + "No weights path provided. Loaded base HF weights via Checkpointer (default policy init)" ) - # We initialize the AutoModel checkpointer here. 
This needs to be persistent because of async checkpointing support - # once NeMo-RL is >= torch 2.9.0 - self.checkpointer = None - self.checkpoint_config = None def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor: if "generation" in self.cfg and self.cfg["generation"] is not None: From acff74752e71f5cae29e19560003beb39f45600b Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 07:44:57 +0000 Subject: [PATCH 04/32] adding moe init Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 39 +++++++++++++++---- nemo_rl/utils/checkpoint.py | 2 - 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index f610b7aae4..3a9f7124f3 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -30,6 +30,8 @@ from nemo_automodel._transformers.utils import ( sliding_window_overwrite, ) +from nemo_automodel.components.moe.parallelizer import parallelize_model as moe_parallelize_model +from nemo_automodel.components.config.loader import _resolve_target from nemo_automodel.components.distributed.cp_utils import ( create_context_parallel_ctx, get_train_context, @@ -107,6 +109,7 @@ from nemo_rl.utils.packed_tensor import packed_broadcast_producer from transformers.utils import TRANSFORMERS_CACHE +from transformers import PreTrainedModel @ray.remote( @@ -340,17 +343,33 @@ def __init__( self.tp_size = manager.tp_size self.cp_size = manager.cp_size - # Parallelize model (FSDP2 + TP plan) - self.model = manager.parallelize(self.model) + # Parallelize model + if not isinstance(self.model, PreTrainedModel): + moe_parallelize_model( + model=self.model, + world_mesh=self.device_mesh, + moe_mesh=self.moe_mesh, + pp_enabled=False, + dp_axis_names=( + ("dp_replicate", "dp_shard_cp") + if "dp_replicate" in self.device_mesh.mesh_dim_names + and "dp_shard_cp" in 
self.device_mesh.mesh_dim_names + else ("dp_shard_cp",) + ), + cp_axis_name="cp", + tp_axis_name="tp", + ep_axis_name="ep", + ep_shard_axis_names=("ep_shard",), + ) + else: + self.model = manager.parallelize(self.model) # Load base model weights across all ranks using Automodel Checkpointer # This mirrors build_model_and_optimizer's is_meta_device + load_weights path self._ensure_checkpointer( config_updates={ "model_repo_id": model_name, - "model_cache_dir": hf_config_overrides.get("cache_dir", ""), - "save_consolidated": False, - "is_peft": False, + "dequantize_base_checkpoint": self.cfg.get("dequantize_base_checkpoint", False), }, checkpoint_root=None, ) @@ -703,10 +722,14 @@ def train( outputs = self.model(**model_args) # Get logprobs - if not hasattr(outputs, "logits"): - logits = self.model.lm_head(outputs.last_hidden_state) + if isinstance(outputs, (torch.Tensor, DTensor)): + # custom models (e.g., those coming from AutoModel) can output logits directly + logits = outputs else: - logits = outputs.logits + if not hasattr(outputs, "logits"): + logits = self.model.lm_head(outputs.last_hidden_state) + else: + logits = outputs.logits del outputs # Apply temperature scaling diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index f3860a3d7e..91c62e284d 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -66,7 +66,6 @@ class CheckpointingConfig(TypedDict): is_peft: NotRequired[bool] # Default: False peft_config: NotRequired[Any] # Default: None is_async: NotRequired[bool] # Default: False - dequantize_base_checkpoint: NotRequired[bool] # Default: False class CheckpointManager: @@ -108,7 +107,6 @@ def __init__(self, config: CheckpointingConfig): self.model_repo_id = config.get("model_repo_id", "") self.is_peft = config.get("is_peft", False) self.is_async = config.get("is_async", False) - self.dequantize_base_checkpoint = config.get("dequantize_base_checkpoint", False) def init_tmp_checkpoint( self, From 
0cbc3ac4e861173fd05ebede47a6dc634cc5ef1d Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 07:46:17 +0000 Subject: [PATCH 05/32] fix Signed-off-by: adil-a --- nemo_rl/utils/checkpoint.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index 91c62e284d..05e0ee2f3a 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -65,7 +65,6 @@ class CheckpointingConfig(TypedDict): model_repo_id: NotRequired[str] # Default: "" is_peft: NotRequired[bool] # Default: False peft_config: NotRequired[Any] # Default: None - is_async: NotRequired[bool] # Default: False class CheckpointManager: @@ -106,7 +105,6 @@ def __init__(self, config: CheckpointingConfig): self.model_cache_dir = config.get("model_cache_dir", "") self.model_repo_id = config.get("model_repo_id", "") self.is_peft = config.get("is_peft", False) - self.is_async = config.get("is_async", False) def init_tmp_checkpoint( self, From 3336fe94ac58fe923b16f96717e827532c5f123d Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 07:55:59 +0000 Subject: [PATCH 06/32] removing legacy checkpointing utils Signed-off-by: adil-a --- nemo_rl/utils/automodel_checkpoint.py | 236 -------------------------- 1 file changed, 236 deletions(-) delete mode 100644 nemo_rl/utils/automodel_checkpoint.py diff --git a/nemo_rl/utils/automodel_checkpoint.py b/nemo_rl/utils/automodel_checkpoint.py deleted file mode 100644 index 5a0c6376ec..0000000000 --- a/nemo_rl/utils/automodel_checkpoint.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Checkpoint management utilities for HF models.""" - -import os -from typing import Any, Optional - -import torch -from nemo_automodel.components.checkpoint._backports.filesystem import ( - SerializationFormat, -) - -# Apply torch backports for compatibility with torch==2.7.1 -from nemo_automodel.components.checkpoint._torch_backports import apply_patches - -from nemo_automodel.components.checkpoint.checkpointing import ( - CheckpointingConfig, - Checkpointer, -) - -# Apply torch backports for compatibility with torch==2.7.1 -apply_patches() - - -def _infer_checkpoint_root(weights_path: str) -> str: - """Infer checkpoint root directory from weights path. - - When weights_path ends with "…/weights/model", we need the parent of - the weights directory (the checkpoint root), not the weights directory itself. - - Args: - weights_path: Path to model weights (e.g., "/path/to/policy/weights/model") - - Returns: - str: Checkpoint root directory (e.g., "/path/to/policy") - """ - weights_dir = os.path.dirname(weights_path) - if weights_dir.endswith("weights"): - return os.path.dirname(weights_dir) - return weights_dir - - -def detect_checkpoint_format(weights_path: str) -> tuple[str, bool]: - """Detect model save format and PEFT status from checkpoint directory. 
- - Args: - weights_path: Path to the checkpoint directory (e.g., weights/model) - - Returns: - tuple: (model_save_format, is_peft) where: - model_save_format is "torch_save" for DCP or "safetensors" for safetensors - is_peft is True if PEFT/adapter patterns are detected - """ - is_peft = False - model_save_format = "safetensors" - try: - # Iterate through all subdirectories and files recursively - all_files = [] - for root, dirs, files in os.walk(weights_path): - all_files.extend(files) - - if any(f.endswith(".distcp") for f in all_files): - model_save_format = "torch_save" - elif any(f.endswith(".safetensors") for f in all_files): - model_save_format = "safetensors" - elif any(f.endswith((".bin", ".pt", ".pth")) for f in all_files): - model_save_format = "torch_save" - - if not is_peft: - is_peft = any("adapter" in f.lower() for f in all_files) - - except (OSError, PermissionError): - pass - - return model_save_format, is_peft - - -def save_checkpoint( - model: torch.nn.Module, - weights_path: str, - optimizer: Optional[torch.optim.Optimizer] = None, - scheduler: Optional[Any] = None, - optimizer_path: Optional[str] = None, - tokenizer: Optional[Any] = None, - tokenizer_path: Optional[str] = None, - model_save_format: str = "safetensors", - is_peft: bool = False, - peft_config: Optional[Any] = None, - save_consolidated: bool = False, - model_state_dict_keys: Optional[list[str]] = None, -) -> None: - """Save a checkpoint of the model and optionally optimizer state. 
- - Args: - model: The PyTorch model to save - weights_path: Path to save model weights - optimizer: Optional optimizer to save - scheduler: Optional scheduler to save - optimizer_path: Path to save optimizer state (required if optimizer provided) - tokenizer: Optional tokenizer to save - tokenizer_path: Path to save tokenizer state (required if tokenizer provided) - model_save_format: Format for saving model ("torch_save" or "safetensors") - is_peft: Whether the model uses PEFT - peft_config: PEFT configuration if is_peft is True - save_consolidated: Whether to save consolidated checkpoints (for HF compatibility) - model_state_dict_keys: Copy of the model state dict keys before any parallelization. - If None, will be extracted from the model's current state dict. - """ - # Create checkpoint config - - # Extract model state dict keys if not provided - if model_state_dict_keys is None: - model_state_dict_keys = list(model.state_dict().keys()) - - valid_formats = {"safetensors", "torch_save"} - if model_save_format not in valid_formats: - raise ValueError( - f"Unsupported model_save_format='{model_save_format}'. " - f"Expected one of {sorted(valid_formats)}." 
- ) - - # Ensure target directories exist - os.makedirs(weights_path, exist_ok=True) - if optimizer_path: - os.makedirs(optimizer_path, exist_ok=True) - if tokenizer_path: - os.makedirs(tokenizer_path, exist_ok=True) - - checkpoint_config = CheckpointingConfig( - enabled=True, - checkpoint_dir=_infer_checkpoint_root(weights_path), - model_save_format=model_save_format, - model_cache_dir="", - model_repo_id="", - save_consolidated=save_consolidated, - is_peft=is_peft, - model_state_dict_keys=model_state_dict_keys, - ) - - # Save model using nemo-automodel API - save_model( - model=model, - weights_path=weights_path, - checkpoint_config=checkpoint_config, - peft_config=peft_config, - tokenizer=tokenizer if tokenizer_path is None else None, - ) - - # Save optimizer if provided - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when saving optimizer state" - ) - save_optimizer( - optimizer=optimizer, - model=model, - weights_path=optimizer_path, - scheduler=scheduler, - ) - - # Save tokenizer separately if tokenizer_path provided - if tokenizer is not None and tokenizer_path is not None: - print(f"Saving tokenizer (or processor) to {tokenizer_path}") - tokenizer.save_pretrained(tokenizer_path) - - -def load_checkpoint( - model: torch.nn.Module, - weights_path: str, - optimizer: Optional[torch.optim.Optimizer] = None, - scheduler: Optional[Any] = None, - optimizer_path: Optional[str] = None, -) -> None: - """Load a model weights and optionally optimizer state. 
- - Args: - model: The PyTorch model whose weights to update - weights_path: Path to load model weights from - optimizer: Optional optimizer to load state into - scheduler: Optional scheduler to load state into - optimizer_path: Path to load optimizer state from (required if optimizer provided) - """ - print(f"Loading weights from {weights_path}") - - model_save_format, is_peft = detect_checkpoint_format(weights_path) - - try: - format_enum = SerializationFormat[model_save_format.upper()] - - # append /model to the weights_path if it doesn't exist - # TODO: remove this once nemo-automodel is updated - if not weights_path.endswith("/model"): - weights_path = os.path.join(weights_path, "model") - - # Load model using nemo-automodel API - load_model( - model=model, - model_path=weights_path, - model_save_format=format_enum, - is_peft=is_peft, - ) - except FileNotFoundError as e: - msg = ( - f"Failed to load model from '{weights_path}': {e}\n" - "Note: DTensorPolicyWorkerV2 expects:\n" - " - Model shards under '/weights/model'\n" - " - Optimizer states under '/optimizer/optim'\n" - "Please verify your checkpoint layout." 
- ) - raise FileNotFoundError(msg) from e - - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when loading optimizer state" - ) - print(f"Loading optimizer from {optimizer_path}") - load_optimizer( - optimizer=optimizer, - model=model, - weights_path=optimizer_path, - scheduler=scheduler, - ) From 19d29aaa4a417d18223b6d6c79df87a3d003e527 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 07:56:11 +0000 Subject: [PATCH 07/32] linting Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 3a9f7124f3..036c4003ea 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -30,8 +30,9 @@ from nemo_automodel._transformers.utils import ( sliding_window_overwrite, ) -from nemo_automodel.components.moe.parallelizer import parallelize_model as moe_parallelize_model -from nemo_automodel.components.config.loader import _resolve_target +from nemo_automodel.components.moe.parallelizer import ( + parallelize_model as moe_parallelize_model, +) from nemo_automodel.components.distributed.cp_utils import ( create_context_parallel_ctx, get_train_context, @@ -40,9 +41,6 @@ clip_grad_by_total_norm_, get_grad_norm, ) -from nemo_automodel.components.distributed.parallelizer import ( - fsdp2_strategy_parallelize, -) from nemo_automodel.components.distributed.fsdp2 import ( FSDP2Manager, ) @@ -51,14 +49,9 @@ to_local_if_dtensor, ) from torch import nn -from torch.distributed.checkpoint.state_dict import ( - StateDictOptions, - set_model_state_dict, -) from torch.distributed.fsdp import ( CPUOffloadPolicy, MixedPrecisionPolicy, - OffloadPolicy, ) from torch.distributed.tensor import DTensor, Shard from transformers import ( @@ -93,10 +86,7 @@ 
import_class_from_path, resolve_model_class, ) -# from nemo_rl.utils.automodel_checkpoint import ( -# load_checkpoint as _unused_wrapper_load_checkpoint, # backward compat: keep import to avoid breaking external users -# save_checkpoint as _unused_wrapper_save_checkpoint, # not used after direct Checkpointer usage -# ) + from nemo_rl.utils.checkpoint import CheckpointingConfig from nemo_automodel.components.checkpoint.checkpointing import ( Checkpointer, @@ -199,7 +189,9 @@ def __init__( # - CP > 1 requires SDPA cp_size_cfg = self.cfg["dtensor_cfg"]["context_parallel_size"] attn_impl = ( - "flash_attention_2" if (self.enable_seq_packing and cp_size_cfg == 1) else ("sdpa" if cp_size_cfg > 1 else None) + "flash_attention_2" + if (self.enable_seq_packing and cp_size_cfg == 1) + else ("sdpa" if cp_size_cfg > 1 else None) ) model_config = AutoConfig.from_pretrained( @@ -325,10 +317,14 @@ def __init__( reduce_dtype=torch.float32, output_dtype=torch.float32, ), - offload_policy=CPUOffloadPolicy(pin_memory=False) if self.cpu_offload else None, + offload_policy=CPUOffloadPolicy(pin_memory=False) + if self.cpu_offload + else None, backend="nccl", world_size=world_size, - activation_checkpointing=self.cfg["dtensor_cfg"]["activation_checkpointing"], + activation_checkpointing=self.cfg["dtensor_cfg"][ + "activation_checkpointing" + ], ) # Store mesh references for downstream usage @@ -369,7 +365,9 @@ def __init__( self._ensure_checkpointer( config_updates={ "model_repo_id": model_name, - "dequantize_base_checkpoint": self.cfg.get("dequantize_base_checkpoint", False), + "dequantize_base_checkpoint": self.cfg.get( + "dequantize_base_checkpoint", False + ), }, checkpoint_root=None, ) @@ -459,7 +457,6 @@ def __init__( "No weights path provided. 
Loaded base HF weights via Checkpointer (default policy init)" ) - def _apply_temperature_scaling(self, logits: torch.Tensor) -> torch.Tensor: if "generation" in self.cfg and self.cfg["generation"] is not None: logits.div_(self.cfg["generation"]["temperature"]) @@ -1897,7 +1894,9 @@ def save_checkpoint( checkpoint_root = _infer_checkpoint_root(weights_path) # Ensure a persistent Checkpointer exists and is configured - self._ensure_checkpointer(config_updates=checkpoint_kwargs, checkpoint_root=checkpoint_root) + self._ensure_checkpointer( + config_updates=checkpoint_kwargs, checkpoint_root=checkpoint_root + ) self.checkpointer.save_model( model=self.model, @@ -1931,7 +1930,9 @@ def load_checkpoint( weights_dir = os.path.dirname(weights_path) checkpoint_root = ( - os.path.dirname(weights_dir) if weights_dir.endswith("weights") else weights_dir + os.path.dirname(weights_dir) + if weights_dir.endswith("weights") + else weights_dir ) # Ensure a persistent Checkpointer exists and is configured @@ -1943,7 +1944,11 @@ def load_checkpoint( checkpoint_root=checkpoint_root, ) - model_dir = weights_path if weights_path.endswith("/model") else os.path.join(weights_path, "model") + model_dir = ( + weights_path + if weights_path.endswith("/model") + else os.path.join(weights_path, "model") + ) self.checkpointer.load_model( model=self.model, @@ -1958,7 +1963,9 @@ def load_checkpoint( scheduler=self.scheduler, ) - def _ensure_checkpointer(self, config_updates=None, checkpoint_root: Optional[str] = None) -> None: + def _ensure_checkpointer( + self, config_updates=None, checkpoint_root: Optional[str] = None + ) -> None: """Create or update a persistent Automodel Checkpointer bound to this worker ranks. 
Args: @@ -1978,14 +1985,18 @@ def _ensure_checkpointer(self, config_updates=None, checkpoint_root: Optional[st base_cfg = AutomodelCheckpointingConfig( enabled=True, checkpoint_dir=checkpoint_root or "", - model_save_format=config_updates.get("model_save_format", "safetensors"), + model_save_format=config_updates.get( + "model_save_format", "safetensors" + ), model_cache_dir=config_updates.get("model_cache_dir", ""), model_repo_id=config_updates.get("model_repo_id", ""), save_consolidated=config_updates.get("save_consolidated", False), is_peft=config_updates.get("is_peft", False), model_state_dict_keys=getattr(self, "model_state_dict_keys", None), is_async=config_updates.get("is_async", False), - dequantize_base_checkpoint=config_updates.get("dequantize_base_checkpoint", False), + dequantize_base_checkpoint=config_updates.get( + "dequantize_base_checkpoint", False + ), ) self.checkpoint_config = base_cfg self.checkpointer = Checkpointer( @@ -2033,6 +2044,7 @@ def report_node_ip_and_gpu_id(self) -> list[tuple[str, int]]: gpu_id = ray.get_gpu_ids()[0] return (ip, gpu_id) + def detect_checkpoint_format(weights_path: str) -> tuple[str, bool]: """Detect model save format and PEFT status from checkpoint directory. @@ -2067,6 +2079,7 @@ def detect_checkpoint_format(weights_path: str) -> tuple[str, bool]: return model_save_format, is_peft + def _infer_checkpoint_root(weights_path: str) -> str: """Infer checkpoint root directory from weights path. 
@@ -2082,4 +2095,4 @@ def _infer_checkpoint_root(weights_path: str) -> str: weights_dir = os.path.dirname(weights_path) if weights_dir.endswith("weights"): return os.path.dirname(weights_dir) - return weights_dir \ No newline at end of file + return weights_dir From dcb4cb2e4237f8410b89a77e981b1bb32f7c4e5f Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 15:42:12 +0000 Subject: [PATCH 08/32] adding moe check Signed-off-by: adil-a --- nemo_rl/models/policy/dtensor_policy_worker_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 036c4003ea..ce0b58dbef 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -340,7 +340,8 @@ def __init__( self.cp_size = manager.cp_size # Parallelize model - if not isinstance(self.model, PreTrainedModel): + is_moe_model = any(["expert" in key for key in self.model_state_dict_keys]) + if not isinstance(self.model, PreTrainedModel) and is_moe_model: moe_parallelize_model( model=self.model, world_mesh=self.device_mesh, From 738338f091eebfdb5da7fe629642f9a9952f119e Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 09:03:46 -0700 Subject: [PATCH 09/32] automodel Signed-off-by: adil-a --- 3rdparty/Automodel-workspace/Automodel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index b27761ba52..a2db048383 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit b27761ba52422d3835428f0c0e05398e58ae2ae2 +Subproject commit a2db048383cd54b3fafc928df4c30bf7bbf7c430 From 62acdfc13884587a6c320378de564a3a1f672325 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 09:51:06 -0700 Subject: [PATCH 10/32] latest automodel bump Signed-off-by: adil-a --- 
3rdparty/Automodel-workspace/Automodel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index a2db048383..3e6952d5a1 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit a2db048383cd54b3fafc928df4c30bf7bbf7c430 +Subproject commit 3e6952d5a104c6ff36f0ee2cd539aeac022d6da7 From b6a3fdd1f65af2b537e1e22241feb79aef7824fb Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 29 Oct 2025 23:04:43 -0700 Subject: [PATCH 11/32] changes Signed-off-by: adil-a --- examples/configs/sft_automodel.yaml | 216 ++++++++++++++++++ .../models/policy/dtensor_policy_worker_v2.py | 22 +- 2 files changed, 234 insertions(+), 4 deletions(-) create mode 100644 examples/configs/sft_automodel.yaml diff --git a/examples/configs/sft_automodel.yaml b/examples/configs/sft_automodel.yaml new file mode 100644 index 0000000000..89bc9bb85d --- /dev/null +++ b/examples/configs/sft_automodel.yaml @@ -0,0 +1,216 @@ +# SFT Algorithm Configuration +sft: + ## total number of steps to train will equal + ## min((max_num_epochs * len(train_dataloader)), max_num_steps) + max_num_epochs: 1 + max_num_steps: 60 + + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 + +checkpointing: + enabled: false + checkpoint_dir: "results/sft" + metric_name: "val_loss" ## set to null to save most recent k checkpoints + higher_is_better: false + keep_top_k: 3 + save_period: 10 + checkpoint_must_save_by: null + +policy: + model_name: "/adasif/models/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/" + # model_name: "/adasif/models/models--moonshotai--Moonlight-16B-A3B/snapshots/ce8bc137e6e29c3b7540ebdd515bbc5bdb20d915/" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + # chat_template can be a Jinja 
template string or path to a .jinja file + chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}" + chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true + train_global_batch_size: 128 + train_micro_batch_size: 8 + max_total_sequence_length: 512 + precision: "bfloat16" + dequantize_base_checkpoint: true + + automodel_model_kwargs: + use_liger_kernel: false + backend: + _target_: nemo_automodel.components.moe.utils.BackendConfig + attn: flex + linear: te + rms_norm: te + enable_deepep: false + fake_balanced_gate: false + enable_hf_state_dict_adapter: true + + dtensor_cfg: + enabled: true + _v2: true + cpu_offload: False + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + context_parallel_size: 1 + expert_parallel_size: 8 + data_parallel_size: 8 + custom_parallel_plan: null + + dynamic_batching: + enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + sequence_length_round: 64 + + sequence_packing: + enabled: false + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} + max_grad_norm: 1.0 + + optimizer: + name: "torch.optim.AdamW" + kwargs: + lr: 5.0e-6 + weight_decay: 0.1 + betas: [0.9, 0.98] + eps: 1e-5 + # when using Dtensor, we need to set foreach + # and fused to False + foreach: False + fused: False + + # ignored since enabled=false, 
but needed for testing purposes + megatron_cfg: + enabled: false + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + context_parallel_size: 1 + pipeline_dtype: ${policy.precision} + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + sequence_parallel: false + freeze_moe_router: false + moe_router_dtype: null + moe_router_load_balancing_type: "aux_loss" + moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + # gives ~25% training perf speedup with sequence packing and apply_rope_fusion + bias_activation_fusion: True + defer_fp32_logits: null + + optimizer: + optimizer: "adam" + lr: 5.0e-6 + min_lr: 4.9999e-6 + weight_decay: 0.1 + bf16: false + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.98 + adam_eps: 1e-5 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: true + use_precision_aware_optimizer: true + + clip_grad: ${policy.max_grad_norm} + + # optimizer cpu offload + optimizer_cpu_offload: false + optimizer_offload_fraction: 0.0 + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: 1000 + lr_warmup_iters: 50 + lr_warmup_init: 4.9999e-6 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + average_in_collective: true + data_parallel_sharding_strategy: "optim_grads_params" + use_custom_fsdp: false + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + add_bos: true + add_eos: true + add_generation_prompt: false + shuffle: false + num_workers: 1 + + dataset_name: 
"squad" + # You can use custom response datasets for training and validation. For example: + # data: + # dataset_name: ResponseDataset + # train_data_path: # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace) + # val_data_path: + # input_key: , default is "input" + # output_key: , default is "output" + # train_split: , default is None # used for HuggingFace datasets + # val_split: , default is None # used for HuggingFace datasets + # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details. + + ## unused with squad dataset + prompt_file: null + split: null + output_key: null + seed: null + + + ## OpenAI format specific configs + # train_data_path: "/path/to/train.jsonl" # Path to training data + # val_data_path: "/path/to/val.jsonl" # Path to validation data + # chat_key: "messages" # Key for messages in the data + # system_key: null # Key for system message (optional) + # system_prompt: null # Default system prompt (optional) + # tool_key: "tools" # Key for tools in the data + # use_preserving_dataset: false # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures) + +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: false + swanlab_enabled: false # Disable SwanLab logging + monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "adil-workspace" + entity: "Nemo-automodel" + name: "nemo-rl-sft-gpt-oss-20b-ep-8" + tensorboard: + log_dir: "tb_logs-sft-dev-${data.dataset_name}" + mlflow: + experiment_name: "sft-dev" + run_name: "sft-dev-${data.dataset_name}" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + 
gpus_per_node: 8 + num_nodes: 1 diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index ce0b58dbef..e075bb05b4 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -48,6 +48,7 @@ get_cpu_state_dict, to_local_if_dtensor, ) +from nemo_automodel.components.config.loader import _resolve_target from torch import nn from torch.distributed.fsdp import ( CPUOffloadPolicy, @@ -171,7 +172,6 @@ def __init__( else: raise ValueError(f"Unknown precision: {self.cfg['precision']}") - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") self.enable_seq_packing = self.cfg["sequence_packing"]["enabled"] if self.enable_seq_packing: assert not self.is_vlm, ( @@ -248,6 +248,18 @@ def __init__( # All ranks initialize model on meta device, so FSDP can shard it. # The actual weights will be broadcast from rank 0. + automodel_model_kwargs = self.cfg.get("automodel_model_kwargs", {}) + if automodel_model_kwargs.get("backend", None) is not None: + backend_class = _resolve_target( + automodel_model_kwargs.get("backend", None)["_target_"] + ) + backend_kwargs = automodel_model_kwargs.get("backend") + backend_kwargs.pop("_target_") + backend = backend_class( + **backend_kwargs, + ) + automodel_model_kwargs["backend"] = backend + with init_empty_weights(): # NeMoAutoModelForCausalLM uses flash_attention_2 by default # so we need to set it to None if sequence packing is disabled @@ -255,9 +267,8 @@ def __init__( self.model = model_class.from_config( model_config, attn_implementation=attn_impl, - use_liger_kernel=False, - trust_remote_code=True, torch_dtype=str(model_config.torch_dtype), + **automodel_model_kwargs, ) # Hold a copy of model state_dict keys before any parallelization (as in train_ft.py) @@ -269,6 +280,7 @@ def __init__( tp_size = self.cfg["dtensor_cfg"].get("tensor_parallel_size", 1) cp_size = 
self.cfg["dtensor_cfg"].get("context_parallel_size", 1) ep_size = self.cfg["dtensor_cfg"].get("expert_parallel_size", 1) + dp_size = self.cfg["dtensor_cfg"].get("data_parallel_size", None) if cp_size > 1 and self.enable_seq_packing: raise ValueError( "Context parallel is not supported for sequence packing. Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." @@ -304,7 +316,7 @@ def __init__( # Build device mesh and parallelize # ------------------------------------------------ manager = FSDP2Manager( - dp_size=None, + dp_size=dp_size, dp_replicate_size=1, tp_size=tp_size, cp_size=cp_size, @@ -363,6 +375,7 @@ def __init__( # Load base model weights across all ranks using Automodel Checkpointer # This mirrors build_model_and_optimizer's is_meta_device + load_weights path + print(self.model) self._ensure_checkpointer( config_updates={ "model_repo_id": model_name, @@ -2097,3 +2110,4 @@ def _infer_checkpoint_root(weights_path: str) -> str: if weights_dir.endswith("weights"): return os.path.dirname(weights_dir) return weights_dir + \ No newline at end of file From 2b86310e3d2e05d8309fc25f354c3f33d284da98 Mon Sep 17 00:00:00 2001 From: adil-a Date: Thu, 30 Oct 2025 13:00:17 -0700 Subject: [PATCH 12/32] cfg Signed-off-by: adil-a --- examples/configs/sft_automodel.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/configs/sft_automodel.yaml b/examples/configs/sft_automodel.yaml index 89bc9bb85d..a0024c18cb 100644 --- a/examples/configs/sft_automodel.yaml +++ b/examples/configs/sft_automodel.yaml @@ -22,8 +22,7 @@ checkpointing: checkpoint_must_save_by: null policy: - model_name: "/adasif/models/models--openai--gpt-oss-20b/snapshots/6cee5e81ee83917806bbde320786a8fb61efebee/" - # model_name: "/adasif/models/models--moonshotai--Moonlight-16B-A3B/snapshots/ce8bc137e6e29c3b7540ebdd515bbc5bdb20d915/" + model_name: "openai/gpt-oss-20b" tokenizer: name: ${policy.model_name} 
## specify if you'd like to use a tokenizer different from the model's default # chat_template can be a Jinja template string or path to a .jinja file @@ -160,7 +159,7 @@ data: add_bos: true add_eos: true add_generation_prompt: false - shuffle: false + shuffle: true num_workers: 1 dataset_name: "squad" @@ -199,9 +198,8 @@ logger: swanlab_enabled: false # Disable SwanLab logging monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: - project: "adil-workspace" - entity: "Nemo-automodel" - name: "nemo-rl-sft-gpt-oss-20b-ep-8" + project: "sft-dev" + name: "sft-dev-${data.dataset_name}" tensorboard: log_dir: "tb_logs-sft-dev-${data.dataset_name}" mlflow: From dd634b6a0312348280409f982e5dc8ba46a04657 Mon Sep 17 00:00:00 2001 From: adil-a Date: Fri, 31 Oct 2025 15:35:36 -0700 Subject: [PATCH 13/32] eof fix Signed-off-by: adil-a --- nemo_rl/models/policy/dtensor_policy_worker_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index e075bb05b4..2e9ff8095b 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -2110,4 +2110,3 @@ def _infer_checkpoint_root(weights_path: str) -> str: if weights_dir.endswith("weights"): return os.path.dirname(weights_dir) return weights_dir - \ No newline at end of file From 2f74d7934b67cd64e1ea18423f3d4593c5eddf97 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 4 Nov 2025 03:42:57 +0000 Subject: [PATCH 14/32] feat: automodel moe integration Signed-off-by: Hemil Desai --- examples/configs/sft_automodel.yaml | 19 +++-- .../models/policy/dtensor_policy_worker_v2.py | 75 +++++++++++------ pyproject.toml | 12 ++- uv.lock | 83 ++++++------------- 4 files changed, 96 insertions(+), 93 deletions(-) diff --git a/examples/configs/sft_automodel.yaml b/examples/configs/sft_automodel.yaml index a0024c18cb..f5c5318bd8 100644 --- 
a/examples/configs/sft_automodel.yaml +++ b/examples/configs/sft_automodel.yaml @@ -38,10 +38,10 @@ policy: use_liger_kernel: false backend: _target_: nemo_automodel.components.moe.utils.BackendConfig - attn: flex + attn: te linear: te rms_norm: te - enable_deepep: false + enable_deepep: true fake_balanced_gate: false enable_hf_state_dict_adapter: true @@ -74,16 +74,19 @@ policy: max_grad_norm: 1.0 optimizer: - name: "torch.optim.AdamW" + #name: "torch.optim.AdamW" + name: "transformer_engine.pytorch.optimizers.fused_adam.FusedAdam" kwargs: lr: 5.0e-6 weight_decay: 0.1 betas: [0.9, 0.98] eps: 1e-5 + store_param_remainders: true + master_weights: true # when using Dtensor, we need to set foreach # and fused to False - foreach: False - fused: False + #foreach: False + #fused: False # ignored since enabled=false, but needed for testing purposes megatron_cfg: @@ -104,7 +107,7 @@ policy: moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing + #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True # gives ~25% training perf speedup with sequence packing and apply_rope_fusion bias_activation_fusion: True @@ -198,8 +201,8 @@ logger: swanlab_enabled: false # Disable SwanLab logging monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: - project: "sft-dev" - name: "sft-dev-${data.dataset_name}" + project: "hemild-rl-automodel" + name: "${policy.model_name}-${data.dataset_name}-attn-${policy.automodel_model_kwargs.backend.attn}-ep${policy.dtensor_cfg.expert_parallel_size}" tensorboard: log_dir: "tb_logs-sft-dev-${data.dataset_name}" mlflow: diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 2e9ff8095b..34da6e9ed9 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ 
-30,25 +30,36 @@ from nemo_automodel._transformers.utils import ( sliding_window_overwrite, ) -from nemo_automodel.components.moe.parallelizer import ( - parallelize_model as moe_parallelize_model, +from nemo_automodel.components.checkpoint._backports.filesystem import ( + SerializationFormat, ) +from nemo_automodel.components.checkpoint.checkpointing import ( + Checkpointer, + _maybe_adapt_state_dict_from_hf, + _maybe_adapt_state_dict_to_hf, +) +from nemo_automodel.components.checkpoint.checkpointing import ( + CheckpointingConfig as AutomodelCheckpointingConfig, +) +from nemo_automodel.components.config.loader import _resolve_target from nemo_automodel.components.distributed.cp_utils import ( create_context_parallel_ctx, get_train_context, ) +from nemo_automodel.components.distributed.fsdp2 import ( + FSDP2Manager, +) from nemo_automodel.components.distributed.grad_utils import ( clip_grad_by_total_norm_, get_grad_norm, ) -from nemo_automodel.components.distributed.fsdp2 import ( - FSDP2Manager, -) from nemo_automodel.components.distributed.tensor_utils import ( get_cpu_state_dict, to_local_if_dtensor, ) -from nemo_automodel.components.config.loader import _resolve_target +from nemo_automodel.components.moe.parallelizer import ( + parallelize_model as moe_parallelize_model, +) from torch import nn from torch.distributed.fsdp import ( CPUOffloadPolicy, @@ -59,8 +70,10 @@ AutoConfig, AutoProcessor, AutoTokenizer, + PreTrainedModel, ) from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM +from transformers.utils import TRANSFORMERS_CACHE from nemo_rl.algorithms.interfaces import LossFunction, LossType from nemo_rl.algorithms.loss_functions import SequencePackingLossWrapper @@ -87,21 +100,10 @@ import_class_from_path, resolve_model_class, ) - from nemo_rl.utils.checkpoint import CheckpointingConfig -from nemo_automodel.components.checkpoint.checkpointing import ( - Checkpointer, - CheckpointingConfig as AutomodelCheckpointingConfig, -) -from 
nemo_automodel.components.checkpoint._backports.filesystem import ( - SerializationFormat, -) from nemo_rl.utils.nsys import wrap_with_nvtx_name from nemo_rl.utils.packed_tensor import packed_broadcast_producer -from transformers.utils import TRANSFORMERS_CACHE -from transformers import PreTrainedModel - @ray.remote( runtime_env=get_runtime_env_for_policy_worker("dtensor_policy_worker_v2") @@ -424,7 +426,7 @@ def __init__( if init_optimizer: optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) self.optimizer = optimizer_cls( - self.model.parameters(), **self.cfg["optimizer"]["kwargs"] + self.model.parameters(), **self.cfg["optimizer"]["kwargs"], exp_avg_dtype=torch.bfloat16, exp_avg_sq_dtype=torch.bfloat16 ) else: self.optimizer = None @@ -704,10 +706,11 @@ def train( ) with get_train_context(False, False, context_parallel_ctx)(): - with torch.autocast(device_type="cuda", dtype=self.dtype): + with nullcontext(): model_args = dict( input_ids=input_ids, attention_mask=attention_mask, + padding_mask=~attention_mask, position_ids=position_ids, use_cache=False, flash_attn_kwargs=flash_attn_kwargs, @@ -831,7 +834,7 @@ def train( # when FSDP reduces the gradients over the DP dim, they're automatically averaged # but we want to sum them so we cancel out the average here - loss *= self.dp_size * self.cp_size + #loss *= self.dp_size * self.cp_size loss.backward() if num_valid_samples > 0: @@ -840,7 +843,22 @@ def train( grad_norm: Optional[float | torch.Tensor] = None if not eval_mode: - with torch.no_grad(): + from nemo_automodel.components.training.utils import scale_grads_and_clip_grad_norm + grad_norm = scale_grads_and_clip_grad_norm( + self.max_grad_norm, + [self.model], + norm_type=2.0, + pp_enabled=False, + device_mesh=self.device_mesh, + moe_mesh=self.moe_mesh, + ep_axis_name="ep" if self.moe_mesh is not None and "ep" in self.moe_mesh.mesh_dim_names else None, + pp_axis_name=None, + foreach=True, + num_label_tokens=1, + 
dp_group_size=self.dp_size*self.cp_size, + ) + grad_norm = grad_norm.detach().cpu().float() + '''with torch.no_grad(): grad_norm = get_grad_norm( self.model.parameters(), dp_cp_group=self.dp_cp_mesh.get_group(), @@ -853,7 +871,7 @@ def train( max_grad_norm=self.max_grad_norm, total_norm=grad_norm, ) - grad_norm = torch.tensor([grad_norm]) + grad_norm = torch.tensor([grad_norm])''' # Update parameters self.optimizer.step() @@ -1034,7 +1052,7 @@ def get_logprobs( ) with get_train_context(False, False, context_parallel_ctx)(): - with torch.autocast(device_type="cuda", dtype=self.dtype): + with nullcontext(): model_args = dict( input_ids=input_ids, attention_mask=attention_mask, @@ -1054,7 +1072,7 @@ def get_logprobs( outputs = self.model(**model_args) - logits = outputs.logits + logits = outputs.logits if hasattr(outputs, "logits") else outputs # Apply temperature scaling logits = self._apply_temperature_scaling(logits) @@ -1106,6 +1124,7 @@ def get_logprobs( assert token_logprobs.shape[1] == seq_len - 1 else: if isinstance(logits, DTensor): + print(f"{logits.__class__=}") token_logprobs = get_logprobs_from_vocab_parallel_logits( logits, input_ids, @@ -1703,7 +1722,9 @@ def maybe_init_zmq(self): def prepare_refit_info(self) -> Optional[dict[str, Any]]: """Prepare state dict metadata for weight refitting and IPC streaming.""" state_dict_info = {} - for name, tensor in self.model.state_dict().items(): + state_dict = self.model.state_dict() + state_dict = _maybe_adapt_state_dict_to_hf(self.model, state_dict) + for name, tensor in state_dict.items(): # all tensor will be casted to self.dtype in stream_weights_via_ipc_zmq/broadcast_weights_for_collective state_dict_info[name] = (tensor.shape, self.dtype) @@ -1729,7 +1750,9 @@ def stream_weights_via_ipc_zmq(self, buffer_size_bytes: int = 0) -> None: def dtensor_params_generator(): """Generator that yields (name, tensor) pairs, converting DTensors to local tensors.""" - for name, tensor in self.model.state_dict().items(): + 
state_dict = self.model.state_dict() + state_dict = _maybe_adapt_state_dict_to_hf(self.model, state_dict) + for name, tensor in state_dict.items(): if isinstance(tensor, DTensor): # Convert DTensor to full tensor for streaming full_tensor = tensor.full_tensor() diff --git a/pyproject.toml b/pyproject.toml index 955f12269b..b18ca48e31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,8 @@ automodel = [ "mamba-ssm", "causal-conv1d", "grouped_gemm @ git+https://github.com/fanshiqing/grouped_gemm@v1.1.4", + "transformer-engine[pytorch]==2.8.0", + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", ] vllm = [ "cuda-python", @@ -72,7 +74,7 @@ vllm = [ # deep_ep also needs libibverbs-dev # sudo apt-get update # sudo apt-get install libibverbs-dev - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@e3908bf5bd0cc6265bcb225d15cd8c996d4759ef", + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", "vllm==0.11.0", "num2words>=0.5.14", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved @@ -221,18 +223,26 @@ requires-dist = ["torch", "einops", "setuptools", "psutil", "ninja"] [[tool.uv.dependency-metadata]] name = "causal-conv1d" +# This version has to match the version in the commit/rev/tag used +version = "1.5.0.post8" requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] name = "mamba-ssm" +# This version has to match the version in the commit/rev/tag used +version = "2.2.4" requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[tool.uv.dependency-metadata]] name = "deep_ep" +# This version has to match the version in the commit/rev/tag used +version = "v1.2.1+bfded34" requires-dist = ["torch", "packaging", "ninja"] [[tool.uv.dependency-metadata]] name = "deep_gemm" +# This version has to match the version in the commit/rev/tag used +version = "v2.0.0+7b6b556" requires-dist = ["torch", "packaging", 
"ninja"] [tool.black] diff --git a/uv.lock b/uv.lock index 24c2d731d8..e2ce06841f 100644 --- a/uv.lock +++ b/uv.lock @@ -31,14 +31,17 @@ overrides = [ [[manifest.dependency-metadata]] name = "causal-conv1d" +version = "1.5.0.post8" requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] name = "deep-ep" +version = "1.2.1+bfded34" requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] name = "deep-gemm" +version = "2.0.0+7b6b556" requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] @@ -47,6 +50,7 @@ requires-dist = ["torch", "einops", "setuptools", "psutil", "ninja"] [[manifest.dependency-metadata]] name = "mamba-ssm" +version = "2.2.4" requires-dist = ["torch", "packaging", "ninja", "causal-conv1d"] [[package]] @@ -1086,13 +1090,25 @@ wheels = [ [[package]] name = "deep-ep" -version = "1.1.0+e3908bf" -source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef#e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" } +version = "1.2.1+bfded34" +source = { git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480#bfded34800dfec415b71503f8205181de90b2480" } +dependencies = [ + { name = "ninja" }, + { name = "packaging" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, +] [[package]] name = "deep-gemm" version = "2.0.0+7b6b556" source = { git = "https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c#7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" } +dependencies = [ + { name = "ninja" }, + { name = "packaging" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = 
"2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, +] [[package]] name = "defusedxml" @@ -2414,15 +2430,11 @@ name = "mamba-ssm" version = "2.2.4" source = { git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4#2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" } dependencies = [ - { name = "einops" }, + { name = "causal-conv1d" }, { name = "ninja" }, { name = "packaging" }, - { name = "setuptools" }, { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "transformers" }, - { name = "triton", version = "3.4.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, - { name = "triton", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, ] [[package]] @@ -3087,40 +3099,6 @@ dependencies = [ ] [package.optional-dependencies] -all = [ - { name = "backoff" }, - { name = "flash-attn" }, - { name = "mistral-common", extra = ["opencv"] }, - { name = "numba" }, - { name = "numpy" }, - { name = "nvidia-nvshmem-cu13" }, - { name = "pillow" }, - { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, - { name = "timm" }, - { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cu129", 
source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, - { name = "transformers" }, -] -deepep = [ - { name = "nvidia-nvshmem-cu13" }, -] -dev = [ - { name = "backoff" }, - { name = "flash-attn" }, - { name = "mistral-common", extra = ["opencv"] }, - { name = "numba" }, - { name = "numpy" }, - { name = "pillow" }, - { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, - { name = "timm" }, - { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "platform_machine == 'aarch64' and sys_platform == 'linux'" }, - { name = "torchvision", version = "0.23.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, - { name = "torchvision", version = "0.23.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, - { name = "transformers" }, -] fa = [ { name = "flash-attn" }, ] @@ -3182,14 +3160,8 @@ requires-dist = [ { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, { name = "megatron-fsdp" }, { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, - { name = "nemo-automodel", extras = ["deepep"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, - { name = "nemo-automodel", extras = ["fa"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, - { name = "nemo-automodel", extras = ["fa"], marker = "extra == 'dev'", editable = "3rdparty/Automodel-workspace/Automodel" }, - { name = 
"nemo-automodel", extras = ["vlm"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, - { name = "nemo-automodel", extras = ["vlm"], marker = "extra == 'dev'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "numba", marker = "extra == 'vlm'" }, { name = "numpy", marker = "extra == 'vlm'" }, - { name = "nvidia-nvshmem-cu13", marker = "extra == 'deepep'" }, { name = "opencv-python-headless", specifier = "==4.10.0.84" }, { name = "pillow", marker = "extra == 'vlm'" }, { name = "pybind11" }, @@ -3208,7 +3180,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'vlm'", specifier = "<=4.57.1" }, { name = "wandb" }, ] -provides-extras = ["vlm", "fa", "moe", "deepep", "dev", "all"] +provides-extras = ["vlm", "fa", "moe"] [package.metadata.requires-dev] build = [ @@ -3281,10 +3253,12 @@ dependencies = [ [package.optional-dependencies] automodel = [ { name = "causal-conv1d" }, + { name = "deep-ep" }, { name = "flash-attn" }, { name = "grouped-gemm" }, { name = "mamba-ssm" }, { name = "nemo-automodel" }, + { name = "transformer-engine", extra = ["pytorch"] }, { name = "vllm" }, ] mcore = [ @@ -3353,7 +3327,8 @@ requires-dist = [ { name = "cuda-python", marker = "extra == 'vllm'" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "debugpy" }, - { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=e3908bf5bd0cc6265bcb225d15cd8c996d4759ef" }, + { name = "deep-ep", marker = "extra == 'automodel'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, + { name = "deep-ep", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepEP.git?rev=bfded34800dfec415b71503f8205181de90b2480" }, { name = "deep-gemm", marker = "extra == 'vllm'", git = "https://github.com/deepseek-ai/DeepGEMM.git?rev=7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" }, { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.8.1" 
}, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.8.1" }, @@ -3392,6 +3367,7 @@ requires-dist = [ { name = "torchdata" }, { name = "torchvision", marker = "sys_platform != 'darwin'", specifier = ">=0.22.0", index = "https://download.pytorch.org/whl/cu129" }, { name = "torchvision", marker = "sys_platform == 'darwin'", specifier = ">=0.22.0", index = "https://pypi.org/simple" }, + { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'automodel'", specifier = "==2.8.0" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'mcore'", specifier = "==2.8.0" }, { name = "transformers", specifier = ">=4.55.4" }, { name = "triton", marker = "sys_platform != 'darwin'", index = "https://download.pytorch.org/whl/cu129" }, @@ -3762,15 +3738,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/49/7e1e3e98f5b8ae79f21260f9a90d8d985e5ad67b69b90b09456fc3c01a18/nvidia_nvshmem_cu12-3.3.24-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0032831c0ec4fdc64c3bd8daeae588f6647ee4afc3376c5871218546acac0e81", size = 139158697, upload-time = "2025-08-22T19:56:39.552Z" }, ] -[[package]] -name = "nvidia-nvshmem-cu13" -version = "3.4.5" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/dc/0f/05cc9c720236dcd2db9c1ab97fff629e96821be2e63103569da0c9b72f19/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dc2a197f38e5d0376ad52cd1a2a3617d3cdc150fd5966f4aee9bcebb1d68fe9", size = 60215947, upload-time = "2025-09-06T00:32:20.022Z" }, - { url = "https://files.pythonhosted.org/packages/3c/35/a9bf80a609e74e3b000fef598933235c908fcefcef9026042b8e6dfde2a9/nvidia_nvshmem_cu13-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:290f0a2ee94c9f3687a02502f3b9299a9f9fe826e6d0287ee18482e78d495b80", size = 60412546, upload-time = "2025-09-06T00:32:41.564Z" }, -] - [[package]] 
name = "nvidia-nvtx-cu12" version = "12.9.79" From 1163407a704e0d72a3eb0ced262ee323e844f1a6 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 4 Nov 2025 20:41:17 -0800 Subject: [PATCH 15/32] bump Signed-off-by: adil-a --- 3rdparty/Automodel-workspace/Automodel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index 3e6952d5a1..5e995e9535 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit 3e6952d5a104c6ff36f0ee2cd539aeac022d6da7 +Subproject commit 5e995e9535e63cbe3358dc2bd81a8ed3a696cee7 From d270a5beaec120482fe9769e34d6ad966f2a6de2 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 22:36:48 -0800 Subject: [PATCH 16/32] adding torch arch list for grouped gemm install Signed-off-by: adil-a --- nemo_rl/utils/venvs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nemo_rl/utils/venvs.py b/nemo_rl/utils/venvs.py index 117a54409d..b40b9debef 100644 --- a/nemo_rl/utils/venvs.py +++ b/nemo_rl/utils/venvs.py @@ -14,6 +14,7 @@ import logging import os import shlex +import shutil import subprocess import time from functools import lru_cache @@ -71,8 +72,6 @@ def create_local_venv( # Force rebuild if requested if force_rebuild and os.path.exists(venv_path): logger.info(f"Force rebuilding venv at {venv_path}") - import shutil - shutil.rmtree(venv_path) logger.info(f"Creating new venv at {venv_path}") @@ -89,6 +88,10 @@ def create_local_venv( # https://docs.astral.sh/uv/concepts/projects/config/#project-environment-path env["UV_PROJECT_ENVIRONMENT"] = venv_path + # Set TORCH_CUDA_ARCH_LIST for grouped_gemm & DeepEP installation. Hopper+ architectures are supported.
+ if "TORCH_CUDA_ARCH_LIST" not in env: + env["TORCH_CUDA_ARCH_LIST"] = "9.0 10.0 12.0" + # Split the py_executable into command and arguments exec_cmd = shlex.split(py_executable) # Command doesn't matter, since `uv` syncs the environment no matter the command. From d038acaa0178481e2133e2af3eb97ebac11a37b4 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 22:38:33 -0800 Subject: [PATCH 17/32] linting Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 34da6e9ed9..f989d20b92 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -35,7 +35,6 @@ ) from nemo_automodel.components.checkpoint.checkpointing import ( Checkpointer, - _maybe_adapt_state_dict_from_hf, _maybe_adapt_state_dict_to_hf, ) from nemo_automodel.components.checkpoint.checkpointing import ( @@ -49,10 +48,6 @@ from nemo_automodel.components.distributed.fsdp2 import ( FSDP2Manager, ) -from nemo_automodel.components.distributed.grad_utils import ( - clip_grad_by_total_norm_, - get_grad_norm, -) from nemo_automodel.components.distributed.tensor_utils import ( get_cpu_state_dict, to_local_if_dtensor, @@ -426,7 +421,10 @@ def __init__( if init_optimizer: optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) self.optimizer = optimizer_cls( - self.model.parameters(), **self.cfg["optimizer"]["kwargs"], exp_avg_dtype=torch.bfloat16, exp_avg_sq_dtype=torch.bfloat16 + self.model.parameters(), + **self.cfg["optimizer"]["kwargs"], + exp_avg_dtype=torch.bfloat16, + exp_avg_sq_dtype=torch.bfloat16, ) else: self.optimizer = None @@ -834,7 +832,7 @@ def train( # when FSDP reduces the gradients over the DP dim, they're automatically averaged # but we want to sum them so we cancel out the average here - #loss *= 
self.dp_size * self.cp_size + # loss *= self.dp_size * self.cp_size loss.backward() if num_valid_samples > 0: @@ -843,22 +841,28 @@ def train( grad_norm: Optional[float | torch.Tensor] = None if not eval_mode: - from nemo_automodel.components.training.utils import scale_grads_and_clip_grad_norm + from nemo_automodel.components.training.utils import ( + scale_grads_and_clip_grad_norm, + ) + grad_norm = scale_grads_and_clip_grad_norm( - self.max_grad_norm, - [self.model], - norm_type=2.0, - pp_enabled=False, - device_mesh=self.device_mesh, - moe_mesh=self.moe_mesh, - ep_axis_name="ep" if self.moe_mesh is not None and "ep" in self.moe_mesh.mesh_dim_names else None, - pp_axis_name=None, - foreach=True, - num_label_tokens=1, - dp_group_size=self.dp_size*self.cp_size, - ) + self.max_grad_norm, + [self.model], + norm_type=2.0, + pp_enabled=False, + device_mesh=self.device_mesh, + moe_mesh=self.moe_mesh, + ep_axis_name="ep" + if self.moe_mesh is not None + and "ep" in self.moe_mesh.mesh_dim_names + else None, + pp_axis_name=None, + foreach=True, + num_label_tokens=1, + dp_group_size=self.dp_size * self.cp_size, + ) grad_norm = grad_norm.detach().cpu().float() - '''with torch.no_grad(): + """with torch.no_grad(): grad_norm = get_grad_norm( self.model.parameters(), dp_cp_group=self.dp_cp_mesh.get_group(), @@ -871,7 +875,7 @@ def train( max_grad_norm=self.max_grad_norm, total_norm=grad_norm, ) - grad_norm = torch.tensor([grad_norm])''' + grad_norm = torch.tensor([grad_norm])""" # Update parameters self.optimizer.step() From 7df0cc5e6e6c923b3b629ebf40a52d2c5ecd39c0 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 22:51:45 -0800 Subject: [PATCH 18/32] uv lock Signed-off-by: adil-a --- uv.lock | 4 ---- 1 file changed, 4 deletions(-) diff --git a/uv.lock b/uv.lock index 9d042bd886..429097526f 100644 --- a/uv.lock +++ b/uv.lock @@ -36,11 +36,7 @@ requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] name = "deep-ep" -<<<<<<< HEAD version = 
"1.2.1+bfded34" -======= -version = "1.1.0+e3908bf" ->>>>>>> origin/main requires-dist = ["torch", "packaging", "ninja"] [[manifest.dependency-metadata]] From b4139f10c6e8ba255da2a78b3a6266bbae8d9b02 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 22:57:30 -0800 Subject: [PATCH 19/32] fix Signed-off-by: adil-a --- .../llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml} | 8 ++++---- pyrefly.toml | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) rename examples/configs/{sft_automodel.yaml => recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml} (97%) diff --git a/examples/configs/sft_automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml similarity index 97% rename from examples/configs/sft_automodel.yaml rename to examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index f5c5318bd8..e2362d5b7c 100644 --- a/examples/configs/sft_automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -53,8 +53,8 @@ policy: activation_checkpointing: false tensor_parallel_size: 1 context_parallel_size: 1 - expert_parallel_size: 8 - data_parallel_size: 8 + expert_parallel_size: 1 + data_parallel_size: 1 custom_parallel_plan: null dynamic_batching: @@ -195,7 +195,7 @@ data: logger: log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + wandb_enabled: false # Make sure you do a ``wandb login [Your API key]'' before running tensorboard_enabled: true mlflow_enabled: false swanlab_enabled: false # Disable SwanLab logging @@ -213,5 +213,5 @@ logger: flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) cluster: - gpus_per_node: 8 + gpus_per_node: 1 num_nodes: 1 diff --git a/pyrefly.toml b/pyrefly.toml index 1c0013ff22..4083362cb0 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -107,7 +107,6 @@ project-includes = [ "nemo_rl/utils/checkpoint.py", 
"nemo_rl/utils/config.py", "nemo_rl/utils/native_checkpoint.py", - "nemo_rl/utils/automodel_checkpoint.py", "nemo_rl/utils/nsys.py", "nemo_rl/utils/nvml.py", "nemo_rl/utils/packed_tensor.py", From a55a2f19e68b09e9d5879c6e5e473391715f0f15 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 22:59:50 -0800 Subject: [PATCH 20/32] wandb yaml fix Signed-off-by: adil-a --- .../recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index e2362d5b7c..57893b0f2e 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -201,7 +201,7 @@ logger: swanlab_enabled: false # Disable SwanLab logging monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: - project: "hemild-rl-automodel" + project: "sft-dev" name: "${policy.model_name}-${data.dataset_name}-attn-${policy.automodel_model_kwargs.backend.attn}-ep${policy.dtensor_cfg.expert_parallel_size}" tensorboard: log_dir: "tb_logs-sft-dev-${data.dataset_name}" From 39bd74cf509250698d451ef1c7995cd061df85ff Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 4 Nov 2025 23:06:30 -0800 Subject: [PATCH 21/32] minimizing yaml Signed-off-by: adil-a --- ...t-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 201 +----------------- 1 file changed, 5 insertions(+), 196 deletions(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index 57893b0f2e..10f7a8af3b 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -1,39 +1,10 @@ -# SFT Algorithm Configuration 
-sft: - ## total number of steps to train will equal - ## min((max_num_epochs * len(train_dataloader)), max_num_steps) - max_num_epochs: 1 - max_num_steps: 60 - - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 - -checkpointing: - enabled: false - checkpoint_dir: "results/sft" - metric_name: "val_loss" ## set to null to save most recent k checkpoints - higher_is_better: false - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - +defaults: ../../sft.yaml policy: - model_name: "openai/gpt-oss-20b" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - # chat_template can be a Jinja template string or path to a .jinja file - chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}" - chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true + model_name: openai/gpt-oss-20b train_global_batch_size: 128 train_micro_batch_size: 8 max_total_sequence_length: 512 - precision: "bfloat16" dequantize_base_checkpoint: true - automodel_model_kwargs: use_liger_kernel: false backend: @@ -44,174 +15,12 @@ policy: enable_deepep: true fake_balanced_gate: false enable_hf_state_dict_adapter: true - dtensor_cfg: - enabled: true _v2: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - expert_parallel_size: 1 - data_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - sequence_length_round: 64 - - 
sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - + expert_parallel_size: 8 + data_parallel_size: 8 optimizer: - #name: "torch.optim.AdamW" - name: "transformer_engine.pytorch.optimizers.fused_adam.FusedAdam" + name: transformer_engine.pytorch.optimizers.fused_adam.FusedAdam kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 0.98] - eps: 1e-5 store_param_remainders: true master_weights: true - # when using Dtensor, we need to set foreach - # and fused to False - #foreach: False - #fused: False - - # ignored since enabled=false, but needed for testing purposes - megatron_cfg: - enabled: false - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - # gives ~25% training perf speedup with sequence packing and apply_rope_fusion - bias_activation_fusion: True - defer_fp32_logits: null - - optimizer: - optimizer: "adam" - lr: 5.0e-6 - min_lr: 4.9999e-6 - weight_decay: 0.1 - bf16: false - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed 
optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - # optimizer cpu offload - optimizer_cpu_offload: false - optimizer_offload_fraction: 0.0 - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 - lr_warmup_init: 4.9999e-6 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - use_custom_fsdp: false - -data: - max_input_seq_length: ${policy.max_total_sequence_length} - add_bos: true - add_eos: true - add_generation_prompt: false - shuffle: true - num_workers: 1 - - dataset_name: "squad" - # You can use custom response datasets for training and validation. For example: - # data: - # dataset_name: ResponseDataset - # train_data_path: # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace) - # val_data_path: - # input_key: , default is "input" - # output_key: , default is "output" - # train_split: , default is None # used for HuggingFace datasets - # val_split: , default is None # used for HuggingFace datasets - # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details. 
- - ## unused with squad dataset - prompt_file: null - split: null - output_key: null - seed: null - - - ## OpenAI format specific configs - # train_data_path: "/path/to/train.jsonl" # Path to training data - # val_data_path: "/path/to/val.jsonl" # Path to validation data - # chat_key: "messages" # Key for messages in the data - # system_key: null # Key for system message (optional) - # system_prompt: null # Default system prompt (optional) - # tool_key: "tools" # Key for tools in the data - # use_preserving_dataset: false # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures) - -logger: - log_dir: "logs" # Base directory for all logs - wandb_enabled: false # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: true - mlflow_enabled: false - swanlab_enabled: false # Disable SwanLab logging - monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "sft-dev" - name: "${policy.model_name}-${data.dataset_name}-attn-${policy.automodel_model_kwargs.backend.attn}-ep${policy.dtensor_cfg.expert_parallel_size}" - tensorboard: - log_dir: "tb_logs-sft-dev-${data.dataset_name}" - mlflow: - experiment_name: "sft-dev" - run_name: "sft-dev-${data.dataset_name}" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 1 - num_nodes: 1 From 4e151cbd4e6a3b849c9ba2bdd30159682e8a60b6 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 5 Nov 2025 00:29:37 -0800 Subject: [PATCH 22/32] clean up Signed-off-by: adil-a --- ...t-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 2 + .../models/policy/dtensor_policy_worker_v2.py | 60 +++++++++++++------ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml 
b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index 10f7a8af3b..71bb743013 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -24,3 +24,5 @@ policy: kwargs: store_param_remainders: true master_weights: true + exp_avg_dtype: torch.bfloat16 + exp_avg_sq_dtype: torch.bfloat16 diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index f989d20b92..231dfda4f9 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -18,7 +18,8 @@ import warnings from collections import defaultdict from contextlib import AbstractContextManager, contextmanager, nullcontext -from typing import Any, Generator, Optional, cast +from typing import Any, Callable, Generator, Optional, cast +import inspect import ray import torch @@ -256,6 +257,7 @@ def __init__( **backend_kwargs, ) automodel_model_kwargs["backend"] = backend + automodel_model_kwargs["use_liger_kernel"] = False with init_empty_weights(): # NeMoAutoModelForCausalLM uses flash_attention_2 by default @@ -420,11 +422,12 @@ def __init__( if init_optimizer: optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) + optimizer_kwargs = _resolve_kwargs( + optimizer_cls, self.cfg["optimizer"]["kwargs"] + ) self.optimizer = optimizer_cls( self.model.parameters(), - **self.cfg["optimizer"]["kwargs"], - exp_avg_dtype=torch.bfloat16, - exp_avg_sq_dtype=torch.bfloat16, + **optimizer_kwargs, ) else: self.optimizer = None @@ -862,20 +865,6 @@ def train( dp_group_size=self.dp_size * self.cp_size, ) grad_norm = grad_norm.detach().cpu().float() - """with torch.no_grad(): - grad_norm = get_grad_norm( - self.model.parameters(), - dp_cp_group=self.dp_cp_mesh.get_group(), - tp_group=self.tp_mesh.get_group(), - dtype=torch.float32, - ) - if self.max_grad_norm is not None: 
- clip_grad_by_total_norm_( - self.model.parameters(), - max_grad_norm=self.max_grad_norm, - total_norm=grad_norm, - ) - grad_norm = torch.tensor([grad_norm])""" # Update parameters self.optimizer.step() @@ -1128,7 +1117,6 @@ def get_logprobs( assert token_logprobs.shape[1] == seq_len - 1 else: if isinstance(logits, DTensor): - print(f"{logits.__class__=}") token_logprobs = get_logprobs_from_vocab_parallel_logits( logits, input_ids, @@ -2137,3 +2125,37 @@ def _infer_checkpoint_root(weights_path: str) -> str: if weights_dir.endswith("weights"): return os.path.dirname(weights_dir) return weights_dir + + +def _resolve_kwargs(callable: Callable, kwargs: dict[str, Any]) -> dict[str, Any]: + """Resolve kwargs for a callable. + + Args: + callable: The callable to resolve kwargs for + kwargs: The kwargs to resolve + + Returns: + The resolved kwargs + """ + + def _resolve_import_class(name: str) -> Any | None: + try: + return import_class_from_path(name) + except Exception: + return + + signature = ( + inspect.signature(callable) + if inspect.isfunction(callable) + else inspect.signature(callable.__init__) + ) + result = {} + for k, v in kwargs.items(): + if k in signature.parameters: + _maybe_resolved_value = ( + _resolve_import_class(v) if isinstance(v, str) else v + ) + result[k] = ( + _maybe_resolved_value if _maybe_resolved_value is not None else v + ) + return result From 4b6ce6d753ad897a56b7bcdebafcf5a4cbcaf1a5 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 5 Nov 2025 10:01:22 -0800 Subject: [PATCH 23/32] dtype map Signed-off-by: adil-a --- ...t-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 4 ++-- .../models/policy/dtensor_policy_worker_v2.py | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index 71bb743013..26f231221f 100644 --- 
a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -24,5 +24,5 @@ policy: kwargs: store_param_remainders: true master_weights: true - exp_avg_dtype: torch.bfloat16 - exp_avg_sq_dtype: torch.bfloat16 + exp_avg_dtype: bfloat16 + exp_avg_sq_dtype: bfloat16 diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 231dfda4f9..d7a0b56d51 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -101,6 +101,13 @@ from nemo_rl.utils.packed_tensor import packed_broadcast_producer +STRING_TO_DTYPE = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, +} + + @ray.remote( runtime_env=get_runtime_env_for_policy_worker("dtensor_policy_worker_v2") ) # pragma: no cover @@ -161,13 +168,9 @@ def __init__( self.cpu_offload = self.cfg["dtensor_cfg"]["cpu_offload"] self.max_grad_norm = self.cfg["max_grad_norm"] - if self.cfg["precision"] == "float32": - self.dtype = torch.float32 - elif self.cfg["precision"] == "bfloat16": - self.dtype = torch.bfloat16 - elif self.cfg["precision"] == "float16": - self.dtype = torch.float16 - else: + try: + self.dtype = STRING_TO_DTYPE[self.cfg["precision"]] + except KeyError: raise ValueError(f"Unknown precision: {self.cfg['precision']}") self.enable_seq_packing = self.cfg["sequence_packing"]["enabled"] @@ -2140,6 +2143,8 @@ def _resolve_kwargs(callable: Callable, kwargs: dict[str, Any]) -> dict[str, Any def _resolve_import_class(name: str) -> Any | None: try: + if name in STRING_TO_DTYPE: + return STRING_TO_DTYPE[name] return import_class_from_path(name) except Exception: return From ef2f92c77d9082fa874a0cbf1d290e4e13d07d75 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 5 Nov 2025 10:12:26 -0800 Subject: [PATCH 24/32] lint Signed-off-by: adil-a --- 
nemo_rl/models/policy/dtensor_policy_worker_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index d7a0b56d51..425a20a644 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -13,13 +13,13 @@ # limitations under the License. import gc +import inspect import itertools import os import warnings from collections import defaultdict from contextlib import AbstractContextManager, contextmanager, nullcontext from typing import Any, Callable, Generator, Optional, cast -import inspect import ray import torch @@ -100,7 +100,6 @@ from nemo_rl.utils.nsys import wrap_with_nvtx_name from nemo_rl.utils.packed_tensor import packed_broadcast_producer - STRING_TO_DTYPE = { "float32": torch.float32, "bfloat16": torch.bfloat16, From 1eef903065598f9d4805715ed24020f47fc0eb27 Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 5 Nov 2025 11:34:09 -0800 Subject: [PATCH 25/32] removing unit test Signed-off-by: adil-a --- tests/unit/utils/test_automodel_checkpoint.py | 420 ------------------ 1 file changed, 420 deletions(-) delete mode 100644 tests/unit/utils/test_automodel_checkpoint.py diff --git a/tests/unit/utils/test_automodel_checkpoint.py b/tests/unit/utils/test_automodel_checkpoint.py deleted file mode 100644 index 9906a1522f..0000000000 --- a/tests/unit/utils/test_automodel_checkpoint.py +++ /dev/null @@ -1,420 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from tempfile import TemporaryDirectory -from unittest.mock import MagicMock, patch - -import pytest -import torch - -# Skip entire module if nemo_automodel is not available -pytest_plugins = [] -try: - import nemo_automodel # noqa: F401 -except ImportError: - pytest.skip("nemo_automodel not available", allow_module_level=True) - -from nemo_rl.utils.automodel_checkpoint import ( - detect_checkpoint_format, - load_checkpoint, - save_checkpoint, -) - - -class TestModel(torch.nn.Module): - """Simple test model with a forward method.""" - - def __init__(self): - super().__init__() - self.layers = torch.nn.ModuleList( - [ - torch.nn.Linear(4, 4), - torch.nn.LayerNorm(4), - torch.nn.ReLU(), - torch.nn.Linear(4, 1), - ] - ) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x - - -@pytest.fixture -def mock_model(): - """Create a simple mock model for testing.""" - return TestModel() - - -@pytest.fixture -def mock_optimizer(): - """Create a simple mock optimizer for testing.""" - model = torch.nn.Linear(4, 1) - return torch.optim.Adam(model.parameters()) - - -@pytest.mark.automodel -class TestDetectCheckpointFormat: - """Test the detect_checkpoint_format function.""" - - def test_directory_with_safetensors(self): - """Test detection for directories containing safetensors files.""" - with TemporaryDirectory() as tmp_dir: - # Create directory with safetensors files - os.makedirs(os.path.join(tmp_dir, "weights", "model")) - weights_path = os.path.join(tmp_dir, "weights", "model") - - # Create safetensors shard files - with 
open( - os.path.join( - weights_path, "shard-00001-model-00001-of-00001.safetensors" - ), - "w", - ) as f: - f.write("dummy content") - with open( - os.path.join( - weights_path, "shard-00002-model-00001-of-00001.safetensors" - ), - "w", - ) as f: - f.write("dummy content") - - format_type, is_peft = detect_checkpoint_format(weights_path) - assert format_type == "safetensors" - assert is_peft == False - - def test_directory_with_dcp_format(self): - """Test detection for directories with DCP (Distributed Checkpoint) format.""" - with TemporaryDirectory() as tmp_dir: - # Create directory structure like: step_3/policy/optimizer/optim - optim_path = os.path.join(tmp_dir, "step_3", "policy", "optimizer", "optim") - os.makedirs(optim_path) - - # Create DCP files (.distcp + .metadata) - with open(os.path.join(optim_path, "__0_0.distcp"), "w") as f: - f.write("dummy dcp content") - with open(os.path.join(optim_path, "__1_0.distcp"), "w") as f: - f.write("dummy dcp content") - with open(os.path.join(optim_path, ".metadata"), "w") as f: - f.write("dummy metadata") - - format_type, is_peft = detect_checkpoint_format(optim_path) - assert format_type == "torch_save" # DCP uses torch_save format - assert is_peft == False - - def test_directory_with_torch_files(self): - """Test detection for directories containing torch save files.""" - with TemporaryDirectory() as tmp_dir: - model_path = os.path.join(tmp_dir, "model") - os.makedirs(model_path) - - # Create torch save files - with open(os.path.join(model_path, "pytorch_model.bin"), "w") as f: - f.write("dummy content") - - format_type, is_peft = detect_checkpoint_format(model_path) - assert format_type == "torch_save" - assert is_peft == False - - def test_peft_detection_in_filenames(self): - """Test PEFT detection from filenames within directories.""" - with TemporaryDirectory() as tmp_dir: - model_path = os.path.join(tmp_dir, "regular_model") - os.makedirs(model_path) - - # Create file with adapter pattern in name - with 
open(os.path.join(model_path, "adapter_model.safetensors"), "w") as f: - f.write("dummy content") - - format_type, is_peft = detect_checkpoint_format(model_path) - assert format_type == "safetensors" - assert is_peft == True # Should detect adapter in filename - - def test_default_fallback(self): - """Test default behavior for non-existent directories.""" - # Non-existent directory should default to safetensors, no PEFT - format_type, is_peft = detect_checkpoint_format("/non/existent/directory") - assert format_type == "safetensors" - assert is_peft == False - - def test_expected_structure(self): - """Test with the expected folder structure from the user.""" - with TemporaryDirectory() as tmp_dir: - # Create the expected structure: step_3/policy/weights/model - weights_path = os.path.join(tmp_dir, "step_3", "policy", "weights", "model") - os.makedirs(weights_path) - - # Create safetensors shard files as in the example - with open( - os.path.join( - weights_path, "shard-00001-model-00001-of-00001.safetensors" - ), - "w", - ) as f: - f.write("dummy content") - with open( - os.path.join( - weights_path, "shard-00002-model-00001-of-00001.safetensors" - ), - "w", - ) as f: - f.write("dummy content") - - format_type, is_peft = detect_checkpoint_format(weights_path) - assert format_type == "safetensors" - assert is_peft == False - - """Test the save_checkpoint function.""" - - @pytest.mark.automodel - @patch("nemo_rl.utils.automodel_checkpoint.save_model") - @patch("nemo_rl.utils.automodel_checkpoint.save_optimizer") - def test_save_model_only(self, mock_save_optimizer, mock_save_model, mock_model): - """Test saving model weights only.""" - with TemporaryDirectory() as tmp_dir: - weights_path = os.path.join(tmp_dir, "weights") - os.makedirs(os.path.dirname(weights_path), exist_ok=True) - - # Save checkpoint - save_checkpoint( - model=mock_model, - weights_path=weights_path, - model_save_format="safetensors", - is_peft=False, - ) - - # Verify save_model was called 
correctly - mock_save_model.assert_called_once() - call_args = mock_save_model.call_args - assert call_args[1]["model"] is mock_model - assert call_args[1]["weights_path"] == weights_path - assert ( - call_args[1]["checkpoint_config"].model_save_format.value - == "safetensors" - ) - assert call_args[1]["checkpoint_config"].is_peft == False - - # Verify optimizer saving was not called - mock_save_optimizer.assert_not_called() - - @pytest.mark.automodel - @patch("nemo_rl.utils.automodel_checkpoint.save_model") - @patch("nemo_rl.utils.automodel_checkpoint.save_optimizer") - def test_save_with_optimizer( - self, mock_save_optimizer, mock_save_model, mock_model, mock_optimizer - ): - """Test saving model and optimizer weights.""" - with TemporaryDirectory() as tmp_dir: - weights_path = os.path.join(tmp_dir, "model", "weights") - optimizer_path = os.path.join(tmp_dir, "optimizer", "optim") - os.makedirs(os.path.dirname(weights_path)) - os.makedirs(os.path.dirname(optimizer_path)) - - # Save checkpoint with optimizer - save_checkpoint( - model=mock_model, - weights_path=weights_path, - optimizer=mock_optimizer, - optimizer_path=optimizer_path, - model_save_format="torch_save", - is_peft=True, - ) - - # Verify both model and optimizer saving were called - mock_save_model.assert_called_once() - mock_save_optimizer.assert_called_once() - - # Check optimizer call args - opt_call_args = mock_save_optimizer.call_args - assert opt_call_args[1]["optimizer"] is mock_optimizer - assert opt_call_args[1]["model"] is mock_model - assert opt_call_args[1]["weights_path"] == optimizer_path - - @pytest.mark.automodel - @patch("nemo_rl.utils.automodel_checkpoint.save_model") - def test_save_with_tokenizer(self, mock_save_model, mock_model): - """Test saving with tokenizer.""" - with TemporaryDirectory() as tmp_dir: - weights_path = os.path.join(tmp_dir, "model", "weights") - tokenizer_path = os.path.join(tmp_dir, "tokenizer") - os.makedirs(os.path.dirname(weights_path)) - 
os.makedirs(tokenizer_path) - - # Create mock tokenizer - mock_tokenizer = MagicMock() - - # Save checkpoint with tokenizer - save_checkpoint( - model=mock_model, - weights_path=weights_path, - tokenizer=mock_tokenizer, - tokenizer_path=tokenizer_path, - ) - - # Verify tokenizer.save_pretrained was called - mock_tokenizer.save_pretrained.assert_called_once_with(tokenizer_path) - - -@pytest.fixture -def mock_experiment(): - """Create a real model, optimizer, and scheduler for integration testing.""" - model = TestModel() - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) - return model, optimizer, scheduler - - -def check_dict_equality(dict1, dict2): - """Recursively check equality of two dictionaries""" - for k in dict1.keys(): - if isinstance(dict1[k], dict): - check_dict_equality(dict1[k], dict2[k]) - elif isinstance(dict1[k], torch.Tensor): - assert torch.allclose(dict1[k], dict2[k]) - else: - assert dict1[k] == dict2[k] - - -@pytest.mark.automodel -class TestSaveLoadIntegration: - """Integration tests that actually save and load checkpoints.""" - - def test_save_and_load_model_only_safetensors(self, mock_experiment): - """Test saving and loading model weights only with safetensors format.""" - test_model, _, _ = mock_experiment - original_state_dict = test_model.state_dict() - - with TemporaryDirectory() as tmp_dir: - weights_path = os.path.join(tmp_dir, "test_model") - - # Save checkpoint - save_checkpoint( - model=test_model, - weights_path=weights_path, - model_save_format="safetensors", - ) - - # Verify files are created - assert os.path.exists(weights_path) - files = os.listdir(os.path.join(weights_path, "model")) - assert any(f.endswith(".safetensors") for f in files) - - # Create a new model with different weights - new_model = TestModel() - # Initialize with different values - for param in new_model.parameters(): - param.data.fill_(999.0) - - # Load the checkpoint 
- load_checkpoint(model=new_model, weights_path=weights_path) - - # Verify the weights match the original - check_dict_equality(new_model.state_dict(), original_state_dict) - - def test_save_and_load_model_only_torch_save(self, mock_experiment): - """Test saving and loading model weights only with torch_save format.""" - test_model, _, _ = mock_experiment - original_state_dict = test_model.state_dict() - - with TemporaryDirectory() as tmp_dir: - weights_path = os.path.join(tmp_dir, "test_model") - - # Save checkpoint - save_checkpoint( - model=test_model, - weights_path=weights_path, - model_save_format="torch_save", - ) - - # Verify files are created - assert os.path.exists(weights_path) - files = os.listdir(os.path.join(weights_path, "model")) - assert any(f.endswith(".distcp") for f in files) - - # Create a new model with different weights - new_model = TestModel() - # Initialize with different values - for param in new_model.parameters(): - param.data.fill_(999.0) - - # Load the checkpoint - load_checkpoint(model=new_model, weights_path=weights_path) - - # Verify the weights match the original - check_dict_equality(new_model.state_dict(), original_state_dict) - - def test_save_and_load_model_and_optimizer(self, mock_experiment): - """Test saving and loading both model and optimizer.""" - test_model, optimizer, scheduler = mock_experiment - - # Take some optimization steps to change optimizer state - for _ in range(5): - loss = torch.nn.functional.mse_loss( - test_model(torch.randn(2, 4)), torch.randn(2, 1) - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - scheduler.step() - - original_model_state = test_model.state_dict() - original_optimizer_state = optimizer.state_dict() - original_scheduler_state = scheduler.state_dict() - - with TemporaryDirectory() as tmp_dir: - model_path = os.path.join(tmp_dir, "model_and_optimizer", "model_path") - optimizer_path = os.path.join(tmp_dir, "model_and_optimizer", "optimizer") - 
os.makedirs(os.path.dirname(model_path), exist_ok=True) - os.makedirs(os.path.dirname(optimizer_path), exist_ok=True) - - # Save checkpoint - save_checkpoint( - model=test_model, - weights_path=model_path, - optimizer=optimizer, - scheduler=scheduler, - optimizer_path=optimizer_path, - ) - - # Verify files are created - assert os.path.exists(model_path) - assert os.path.exists(optimizer_path) - - # Create new model, optimizer, and scheduler with different state - new_model = TestModel() - new_optimizer = torch.optim.Adam(new_model.parameters(), lr=0.001) - new_scheduler = torch.optim.lr_scheduler.StepLR( - new_optimizer, step_size=4, gamma=0.2 - ) - - # Initialize with different values - for param in new_model.parameters(): - param.data.fill_(999.0) - - # Load the checkpoint - load_checkpoint( - model=new_model, - weights_path=model_path, - optimizer=new_optimizer, - scheduler=new_scheduler, - optimizer_path=optimizer_path, - ) - - # Verify all states match the original - check_dict_equality(new_model.state_dict(), original_model_state) - check_dict_equality(new_optimizer.state_dict(), original_optimizer_state) - assert new_scheduler.state_dict() == original_scheduler_state From 24214e93a6baadcf77d47e35d1c30089d980bab5 Mon Sep 17 00:00:00 2001 From: Adil Asif Date: Wed, 5 Nov 2025 23:29:43 -0800 Subject: [PATCH 26/32] adding fixes from unit tests Signed-off-by: Adil Asif --- .../sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 1 - nemo_rl/models/policy/dtensor_policy_worker_v2.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index 26f231221f..1910cb665f 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -6,7 +6,6 @@ policy: max_total_sequence_length: 512 
dequantize_base_checkpoint: true automodel_model_kwargs: - use_liger_kernel: false backend: _target_: nemo_automodel.components.moe.utils.BackendConfig attn: te diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 425a20a644..90ee63aaa4 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -153,8 +153,10 @@ def __init__( configure_dynamo_cache() self.cfg = config + self.cpu_offload = self.cfg["dtensor_cfg"]["cpu_offload"] # torch distributed init. Envars for rank, world_size, and master_addr and master_port are set from the ray remote call - torch.distributed.init_process_group(backend="nccl") + backend = "nccl" if not self.cpu_offload else "cuda:nccl,cpu:gloo" + torch.distributed.init_process_group(backend=backend) self.rank = torch.distributed.get_rank() world_size = torch.distributed.get_world_size() model_name = self.cfg["model_name"] @@ -164,7 +166,6 @@ def __init__( self.checkpointer = None self.checkpoint_config = None - self.cpu_offload = self.cfg["dtensor_cfg"]["cpu_offload"] self.max_grad_norm = self.cfg["max_grad_norm"] try: @@ -259,16 +260,18 @@ def __init__( **backend_kwargs, ) automodel_model_kwargs["backend"] = backend - automodel_model_kwargs["use_liger_kernel"] = False with init_empty_weights(): # NeMoAutoModelForCausalLM uses flash_attention_2 by default # so we need to set it to None if sequence packing is disabled # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180 - self.model = model_class.from_config( - model_config, + self.model = model_class.from_pretrained( + model_name, attn_implementation=attn_impl, torch_dtype=str(model_config.torch_dtype), + trust_remote_code=True, + config=model_config, + use_liger_kernel=False, **automodel_model_kwargs, ) @@ -338,6 +341,7 @@ def __init__( 
activation_checkpointing=self.cfg["dtensor_cfg"][ "activation_checkpointing" ], + custom_tp_plan=self.cfg["dtensor_cfg"].get("custom_parallel_plan", None), ) # Store mesh references for downstream usage From 5489b21bfce2c24b86302775c724b58e94524c39 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 25 Nov 2025 15:42:55 -0800 Subject: [PATCH 27/32] bumping automodel + v2 fixes Signed-off-by: adil-a --- 3rdparty/Automodel-workspace/Automodel | 2 +- .../models/policy/dtensor_policy_worker_v2.py | 4 +- uv.lock | 331 ++++++++++++++++-- 3 files changed, 302 insertions(+), 35 deletions(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index 5e995e9535..d7f248adf3 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit 5e995e9535e63cbe3358dc2bd81a8ed3a696cee7 +Subproject commit d7f248adf367585f0bd9c5febea6401a6cd6ea4f diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index f71b7952b4..623291fcb5 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -25,7 +25,7 @@ import torch import zmq from accelerate import init_empty_weights -from nemo_automodel import ( +from nemo_automodel._transformers.auto_model import ( NeMoAutoModelForSequenceClassification, ) from nemo_automodel.components.checkpoint._backports.filesystem import ( @@ -868,7 +868,7 @@ def train( num_label_tokens=1, dp_group_size=self.dp_size * self.cp_size, ) - grad_norm = grad_norm.detach().cpu().float() + grad_norm = torch.tensor(grad_norm, device="cpu", dtype=torch.float32) # Update parameters self.optimizer.step() diff --git a/uv.lock b/uv.lock index d85f6e0302..8ae775957d 100644 --- a/uv.lock +++ b/uv.lock @@ -350,6 +350,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/f6/22/91616fe707a5c5510de2cac9b046a30defe7007ba8a0c04f9c08f27df312/audioop_lts-0.2.2-cp314-cp314t-win_arm64.whl", hash = "sha256:b492c3b040153e68b9fdaff5913305aaaba5bb433d8a7f73d5cf6a64ed3cc1dd", size = 25206, upload-time = "2025-08-05T16:43:16.444Z" }, ] +[[package]] +name = "audioread" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/4a/874ecf9b472f998130c2b5e145dcdb9f6131e84786111489103b66772143/audioread-3.1.0.tar.gz", hash = "sha256:1c4ab2f2972764c896a8ac61ac53e261c8d29f0c6ccd652f84e18f08a4cab190", size = 20082, upload-time = "2025-10-26T19:44:13.484Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/16/fbe8e1e185a45042f7cd3a282def5bb8d95bb69ab9e9ef6a5368aa17e426/audioread-3.1.0-py3-none-any.whl", hash = "sha256:b30d1df6c5d3de5dcef0fb0e256f6ea17bdcf5f979408df0297d8a408e2971b4", size = 23143, upload-time = "2025-10-26T19:44:12.016Z" }, +] + [[package]] name = "av" version = "15.0.0" @@ -407,19 +420,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload-time = "2025-04-15T17:05:12.221Z" }, ] -[[package]] -name = "bitsandbytes" -version = "0.45.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, - { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, 
-] -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/b7/cb5ce4d1a382cf53c19ef06c5fc29e85f5e129b4da6527dd207d90a5b8ad/bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:a5453f30cc6aab6ccaac364e6bf51a7808d3da5f71763dffeb6d9694c59136e4", size = 76059261, upload-time = "2025-04-07T13:32:52.573Z" }, - { url = "https://files.pythonhosted.org/packages/a6/4c/77b535e025ce780d2ada8271c1e481fb7337c1df2588a52fe1c9bd87d2e8/bitsandbytes-0.45.5-py3-none-win_amd64.whl", hash = "sha256:ed1c61b91d989d6a33fd05737d6edbf5086d8ebc89235ee632c7a19144085da2", size = 75430204, upload-time = "2025-04-07T13:32:57.553Z" }, -] - [[package]] name = "blake3" version = "1.0.5" @@ -1147,12 +1147,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/57/ecc9ae29fa5b2d90107cd1d9bf8ed19aacb74b2264d986ae9d44fe9bdf87/debugpy-1.8.16-py2.py3-none-any.whl", hash = "sha256:19c9521962475b87da6f673514f7fd610328757ec993bf7ec0d8c96f9a325f9e", size = 5287700, upload-time = "2025-08-06T18:00:42.333Z" }, ] +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "decord" version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] wheels = [ { url 
= "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" }, @@ -1217,6 +1226,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/ae/afb1487556e2dc827a17097aac8158a25b433a345386f0e249f6d2694ccb/devtools-0.12.2-py3-none-any.whl", hash = "sha256:c366e3de1df4cdd635f1ad8cbcd3af01a384d7abda71900e68d43b04eb6aaca7", size = 19411, upload-time = "2023-09-03T16:56:59.049Z" }, ] +[[package]] +name = "diffusers" +version = "0.35.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "importlib-metadata" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/68/288ca23c7c05c73e87ffe5efffc282400ac9b017f7a9bb03883f4310ea15/diffusers-0.35.2.tar.gz", hash = "sha256:30ecd552303edfcfe1724573c3918a8462ee3ab4d529bdbd4c0045f763affded", size = 3366711, upload-time = "2025-10-15T04:05:17.213Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/2e/38d9824f8c6bb048c5ba21c6d4da54c29c162a46b58b3ef907a360a76d3e/diffusers-0.35.2-py3-none-any.whl", hash = "sha256:d50d5e74fdd6dcf55e5c1d304bc52cc7c2659abd1752740d736d7b54078b4db5", size = 4121649, upload-time = "2025-10-15T04:05:14.391Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -1432,6 +1460,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] +[[package]] +name = "fla-core" +version = "0.4.0" 
+source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "einops" }, + { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.8.0+cu129", source = { registry = "https://download.pytorch.org/whl/cu129" }, marker = "sys_platform != 'darwin'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/db/3d/79a9d5c8cd973c86f35403931031787dfc6cc97d838a42d4c62e8cbbb66f/fla_core-0.4.0.tar.gz", hash = "sha256:d975022b074e97bfd086dc6b767dccb35e27a9fe36f26f3b26b1c2b68b36a1c8", size = 316316, upload-time = "2025-10-27T08:18:51.673Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/0c/d52ab65e9c163631895052d70d4111f8530ca52f45beb0895378d1a2a8b5/fla_core-0.4.0-py3-none-any.whl", hash = "sha256:5396f36a9838c99f9e45c70e88e2e0b26688f719d07d2ddd61be16d29327f4ea", size = 438519, upload-time = "2025-10-27T08:18:49.561Z" }, +] + [[package]] name = "flash-attn" version = "2.8.1" @@ -1446,6 +1488,19 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/e8/6d/7066d160bdffa2f9da29a8c3957f266b17a03ca0b3bdc8fdae86d9881fe7/flash_attn-2.8.1.tar.gz", hash = "sha256:0ff003899fcb244f357905b04f622d5c9736887126dd6675f8f4bc52954e3923", size = 8166563, upload-time = "2025-07-10T05:16:39.729Z" } +[[package]] +name = "flash-linear-attention" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fla-core" }, + { name = "transformers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/9a/e546815da2bf149e0af58449ff1ca10074165af4384febead438ad46f74c/flash_linear_attention-0.4.0.tar.gz", hash = "sha256:c5d2bf6e1a766af3a4426f07f710b0b87809f7218de21eb313314be6ff1b0dba", size = 157646, upload-time = "2025-10-27T08:18:52.445Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/3c/76/4f716c953608204c970de7cd4045db1af02643d7f19c94a49254834b7563/flash_linear_attention-0.4.0-py3-none-any.whl", hash = "sha256:50c97163f7cb64dc53585194ef36af44d2a6bc545227c4f73bb3ba9062630f1a", size = 290439, upload-time = "2025-10-27T08:18:50.589Z" }, +] + [[package]] name = "flask" version = "3.1.2" @@ -1617,6 +1672,18 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "ftfy" +version = "6.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a5/d3/8650919bc3c7c6e90ee3fa7fd618bf373cbbe55dff043bd67353dbb20cd8/ftfy-6.3.1.tar.gz", hash = "sha256:9b3c3d90f84fb267fe64d375a07b7f8912d817cf86009ae134aa03e1819506ec", size = 308927, upload-time = "2024-10-26T00:50:35.149Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl", hash = "sha256:7c70eb532015cd2f9adb53f101fb6c7945988d023a085d127d1573dc49dd0083", size = 44821, upload-time = "2024-10-26T00:50:33.425Z" }, +] + [[package]] name = "gguf" version = "0.17.1" @@ -1946,6 +2013,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "h2" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/17/afa56379f94ad0fe8defd37d6eb3f89a25404ffc71d4d848893d270325fc/h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1", size = 2152026, upload-time = "2025-08-23T18:12:19.778Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/b2/119f6e6dcbd96f9069ce9a2665e0146588dc9f88f29549711853645e736a/h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd", size = 61779, upload-time = "2025-08-23T18:12:17.779Z" }, +] + [[package]] name = "hatchling" version = "1.27.0" @@ -1976,6 +2056,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/d3/0aaf279f4f3dea58e99401b92c31c0f752924ba0e6c7d7bb07b1dbd7f35e/hf_xet-1.1.8-cp37-abi3-win_amd64.whl", hash = "sha256:4171f31d87b13da4af1ed86c98cf763292e4720c088b4957cf9d564f92904ca9", size = 2801689, upload-time = "2025-08-18T22:01:04.81Z" }, ] +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload-time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload-time = "2025-01-22T21:44:56.92Z" }, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -2026,6 +2115,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + [[package]] name = "huggingface-hub" version = "0.34.4" @@ -2059,6 +2153,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload-time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload-time = "2025-01-22T21:41:47.295Z" }, +] + [[package]] name = "identify" version = "2.6.13" @@ -2077,6 +2180,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "imageio-ffmpeg" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/c3343c721f2a1b0c9fc71c1aebf1966a3b7f08c2eea8ed5437a2865611d6/imageio_ffmpeg-0.6.0.tar.gz", hash = "sha256:e2556bed8e005564a9f925bb7afa4002d82770d6b08825078b7697ab88ba1755", size = 25210, upload-time = "2025-01-16T21:34:32.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/58/87ef68ac83f4c7690961bce288fd8e382bc5f1513860fc7f90a9c1c1c6bf/imageio_ffmpeg-0.6.0-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.whl", hash = 
"sha256:9d2baaf867088508d4a3458e61eeb30e945c4ad8016025545f66c4b5aaef0a61", size = 24932969, upload-time = "2025-01-16T21:34:20.464Z" }, + { url = "https://files.pythonhosted.org/packages/40/5c/f3d8a657d362cc93b81aab8feda487317da5b5d31c0e1fdfd5e986e55d17/imageio_ffmpeg-0.6.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b1ae3173414b5fc5f538a726c4e48ea97edc0d2cdc11f103afee655c463fa742", size = 21113891, upload-time = "2025-01-16T21:34:00.277Z" }, + { url = "https://files.pythonhosted.org/packages/33/e7/1925bfbc563c39c1d2e82501d8372734a5c725e53ac3b31b4c2d081e895b/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1d47bebd83d2c5fc770720d211855f208af8a596c82d17730aa51e815cdee6dc", size = 25632706, upload-time = "2025-01-16T21:33:53.475Z" }, + { url = "https://files.pythonhosted.org/packages/a0/2d/43c8522a2038e9d0e7dbdf3a61195ecc31ca576fb1527a528c877e87d973/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:c7e46fcec401dd990405049d2e2f475e2b397779df2519b544b8aab515195282", size = 29498237, upload-time = "2025-01-16T21:34:13.726Z" }, + { url = "https://files.pythonhosted.org/packages/a0/13/59da54728351883c3c1d9fca1710ab8eee82c7beba585df8f25ca925f08f/imageio_ffmpeg-0.6.0-py3-none-win32.whl", hash = "sha256:196faa79366b4a82f95c0f4053191d2013f4714a715780f0ad2a68ff37483cc2", size = 19652251, upload-time = "2025-01-16T21:34:06.812Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c6/fa760e12a2483469e2bf5058c5faff664acf66cadb4df2ad6205b016a73d/imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02fa47c83703c37df6bfe4896aab339013f62bf02c5ebf2dce6da56af04ffc0a", size = 31246824, upload-time = "2025-01-16T21:34:28.6Z" }, +] + [[package]] name = "imagesize" version = "1.4.1" @@ -2338,6 +2455,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/60/dfbbf40e3a371388c0e03ff65b01319b7d4023e883df6d7261125772ffdc/latex2sympy2_extended-1.10.2-py3-none-any.whl", hash = 
"sha256:f910442c5b02a466c1046f47d05cc5285181068b882399281f30102715337fb7", size = 207855, upload-time = "2025-07-02T15:26:04.88Z" }, ] +[[package]] +name = "lazy-loader" +version = "0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6b/c875b30a1ba490860c93da4cabf479e03f584eba06fe5963f6f6644653d8/lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1", size = 15431, upload-time = "2024-04-05T13:03:12.261Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/60/d497a310bde3f01cb805196ac61b7ad6dc5dcf8dce66634dc34364b20b4f/lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc", size = 12097, upload-time = "2024-04-05T13:03:10.514Z" }, +] + +[[package]] +name = "librosa" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioread" }, + { name = "decorator" }, + { name = "joblib" }, + { name = "lazy-loader" }, + { name = "msgpack" }, + { name = "numba" }, + { name = "numpy" }, + { name = "pooch" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "soundfile" }, + { name = "soxr" }, + { name = "standard-aifc", marker = "python_full_version >= '3.13'" }, + { name = "standard-sunau", marker = "python_full_version >= '3.13'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/64/36/360b5aafa0238e29758729e9486c6ed92a6f37fa403b7875e06c115cdf4a/librosa-0.11.0.tar.gz", hash = "sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908", size = 327001, upload-time = "2025-03-11T15:09:54.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ba/c63c5786dfee4c3417094c4b00966e61e4a63efecee22cb7b4c0387dda83/librosa-0.11.0-py3-none-any.whl", hash = 
"sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1", size = 260749, upload-time = "2025-03-11T15:09:52.982Z" }, +] + [[package]] name = "liger-kernel" version = "0.6.2" @@ -3094,10 +3249,14 @@ wheels = [ name = "nemo-automodel" source = { editable = "3rdparty/Automodel-workspace/Automodel" } dependencies = [ - { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "datasets" }, + { name = "diffusers" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "megatron-fsdp" }, + { name = "mlflow" }, + { name = "opencv-python-headless" }, { name = "pybind11" }, { name = "pyyaml" }, { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, @@ -3109,6 +3268,25 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "backoff" }, + { name = "flash-linear-attention" }, + { name = "mistral-common", extra = ["opencv"] }, + { name = "numba" }, + { name = "numpy" }, + { name = "perceptron" }, + { name = "pillow" }, + { name = "qwen-omni-utils" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, + { name = "sentencepiece" }, + { name = "timm" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, +] +extra = [ + { name = "flash-linear-attention" }, + { name = "perceptron" }, + { name = "sentencepiece" }, +] fa = [ { name = "flash-attn" }, ] @@ -3121,10 +3299,10 @@ vlm = [ { name = "numba" }, { name = "numpy" }, { name = "pillow" }, - { name = "qwen-vl-utils", extra = ["decord"] }, + { name = "qwen-omni-utils" }, + { name = "qwen-vl-utils", extra = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "timm" }, - { name = "torchcodec" }, - { name = "transformers" }, + 
{ name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, ] [package.dev-dependencies] @@ -3158,36 +3336,45 @@ test = [ [package.metadata] requires-dist = [ { name = "backoff", marker = "extra == 'vlm'" }, - { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" }, { name = "datasets", specifier = ">=4.0.0" }, + { name = "diffusers" }, { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" }, + { name = "flash-linear-attention", marker = "extra == 'extra'" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, { name = "megatron-fsdp" }, { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, + { name = "mlflow" }, + { name = "nemo-automodel", extras = ["extra"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, + { name = "nemo-automodel", extras = ["vlm"], marker = "extra == 'all'", editable = "3rdparty/Automodel-workspace/Automodel" }, { name = "numba", marker = "extra == 'vlm'" }, { name = "numpy", marker = "extra == 'vlm'" }, + { name = "opencv-python-headless", specifier = "==4.10.0.84" }, + { name = "perceptron", marker = "extra == 'extra'" }, { name = "pillow", marker = "extra == 'vlm'" }, { name = "pybind11" }, { name = "pyyaml" }, - { name = "qwen-vl-utils", extras = ["decord"], marker = "extra == 'vlm'" }, - { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.8.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.8.0", index = "https://pypi.org/simple" }, + { name = "qwen-omni-utils", marker = "extra == 'vlm'" }, + { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and 
sys_platform != 'darwin' and extra == 'vlm'" }, + { name = "sentencepiece", marker = "extra == 'extra'" }, + { name = "timm", marker = "extra == 'vlm'", specifier = "<=1.0.22" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, { name = "torchao" }, - { name = "torchcodec", marker = "extra == 'vlm'" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, { name = "torchdata" }, { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "<=4.55.4" }, - { name = "transformers", marker = "extra == 'vlm'", specifier = "<=4.55.4" }, + { name = "transformers", specifier = "<=4.57.1" }, { name = "wandb" }, ] -provides-extras = ["vlm", "fa", "moe"] +provides-extras = ["vlm", "fa", "moe", "extra", "all"] [package.metadata.requires-dev] build = [ { name = "setuptools" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.8.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.8.0", index = "https://pypi.org/simple" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, ] dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }] docs = [ @@ -4203,6 +4390,24 @@ requires-dist = [ { name = "yappi" }, ] +[[package]] +name = "perceptron" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, + { name = "httpx", extra = ["http2"] }, + { name = 
"numpy" }, + { name = "pillow" }, + { name = "rich" }, + { name = "shellingham" }, + { name = "typer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/30/60/85db2243d8b550823603d8f9c5845b0dd0f01074e9aabf0b2af0c4f52565/perceptron-0.1.4.tar.gz", hash = "sha256:62fd190efb74925e2cc33c0cd38761e19959be3bdb7b24fbf9e3386d6961f690", size = 78116, upload-time = "2025-11-12T20:00:28.024Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/17/b7cb1a10ebb0a9a4c9fbcd96a28b43d44e08a90f620bab07e644a658d2f1/perceptron-0.1.4-py3-none-any.whl", hash = "sha256:f490a6df6c15167e91e1a528601cae98ce99a30991cf792f9ef83ebc15d335c4", size = 57421, upload-time = "2025-11-12T20:00:26.395Z" }, +] + [[package]] name = "pillow" version = "11.3.0" @@ -4300,6 +4505,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "pooch" +version = "1.8.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging" }, + { name = "platformdirs" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/77/b3d3e00c696c16cf99af81ef7b1f5fe73bd2a307abca41bd7605429fe6e5/pooch-1.8.2.tar.gz", hash = "sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10", size = 59353, upload-time = "2024-06-06T16:53:46.224Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/87/77cc11c7a9ea9fd05503def69e3d18605852cd0d4b0d3b8f15bbeb3ef1d1/pooch-1.8.2-py3-none-any.whl", hash = "sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47", size = 64574, upload-time = "2024-06-06T16:53:44.343Z" }, +] + [[package]] name = "pre-commit" version = "4.3.0" @@ -5096,6 +5315,22 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/89/32/3836ed85947b06f1d67c07ce16c00b0cf8c053ab0b249d234f9f81ff95ff/pyzmq-27.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:0fc24bf45e4a454e55ef99d7f5c8b8712539200ce98533af25a5bfa954b6b390", size = 575098, upload-time = "2025-08-03T05:04:27.974Z" }, ] +[[package]] +name = "qwen-omni-utils" +version = "0.0.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "av" }, + { name = "librosa" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/b1/cc58b03b5eadddc0812cef884d013ed6cc66b09f9b0f5b45123f89dcd056/qwen_omni_utils-0.0.8.tar.gz", hash = "sha256:b5808673e1455f4115cb784a62cdc8e8616576221a01fc738610b0f9268cb33c", size = 8145, upload-time = "2025-06-12T11:02:05.411Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/b1/dcdd69246a3c3c3bd6f6ced58e2307b3afbd894c4412c29fd49dd897e562/qwen_omni_utils-0.0.8-py3-none-any.whl", hash = "sha256:c42bcc633fbfd84d565ff0de9d45fae68a6b57a9b7b97a4b77eda71a0d3ee73a", size = 9218, upload-time = "2025-06-12T11:02:03.981Z" }, +] + [[package]] name = "qwen-vl-utils" version = "0.0.11" @@ -5113,7 +5348,7 @@ wheels = [ [package.optional-dependencies] decord = [ - { name = "decord" }, + { name = "decord", marker = "(platform_machine != 'aarch64' and sys_platform != 'darwin') or sys_platform == 'win32'" }, ] [[package]] @@ -6033,6 +6268,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload-time = "2024-12-10T12:05:27.824Z" }, ] +[[package]] +name = "standard-aifc" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, + { name = "standard-chunk", marker = 
"python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/53/6050dc3dde1671eb3db592c13b55a8005e5040131f7509cef0215212cb84/standard_aifc-3.13.0.tar.gz", hash = "sha256:64e249c7cb4b3daf2fdba4e95721f811bde8bdfc43ad9f936589b7bb2fae2e43", size = 15240, upload-time = "2024-10-30T16:01:31.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/52/5fbb203394cc852334d1575cc020f6bcec768d2265355984dfd361968f36/standard_aifc-3.13.0-py3-none-any.whl", hash = "sha256:f7ae09cc57de1224a0dd8e3eb8f73830be7c3d0bc485de4c1f82b4a7f645ac66", size = 10492, upload-time = "2024-10-30T16:01:07.071Z" }, +] + +[[package]] +name = "standard-chunk" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/06/ce1bb165c1f111c7d23a1ad17204d67224baa69725bb6857a264db61beaf/standard_chunk-3.13.0.tar.gz", hash = "sha256:4ac345d37d7e686d2755e01836b8d98eda0d1a3ee90375e597ae43aaf064d654", size = 4672, upload-time = "2024-10-30T16:18:28.326Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/90/a5c1084d87767d787a6caba615aa50dc587229646308d9420c960cb5e4c0/standard_chunk-3.13.0-py3-none-any.whl", hash = "sha256:17880a26c285189c644bd5bd8f8ed2bdb795d216e3293e6dbe55bbd848e2982c", size = 4944, upload-time = "2024-10-30T16:18:26.694Z" }, +] + +[[package]] +name = "standard-sunau" +version = "3.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "audioop-lts", marker = "python_full_version >= '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/e3/ce8d38cb2d70e05ffeddc28bb09bad77cfef979eb0a299c9117f7ed4e6a9/standard_sunau-3.13.0.tar.gz", hash = "sha256:b319a1ac95a09a2378a8442f403c66f4fd4b36616d6df6ae82b8e536ee790908", size = 9368, upload-time = "2024-10-30T16:01:41.626Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/34/ae/e3707f6c1bc6f7aa0df600ba8075bfb8a19252140cd595335be60e25f9ee/standard_sunau-3.13.0-py3-none-any.whl", hash = "sha256:53af624a9529c41062f4c2fd33837f297f3baa196b0cfceffea6555654602622", size = 7364, upload-time = "2024-10-30T16:01:28.003Z" }, +] + [[package]] name = "starlette" version = "0.47.2" @@ -6439,9 +6708,7 @@ name = "torchcodec" version = "0.6.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/b3/11326a0e7a3c803a95975cfce4ac88fa4ea1a0d432bb876081046c5a5554/torchcodec-0.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fba260145a239b5afe13336e3a5bc1b089c9c31a073e9a7c2026d4cbd853fdd9", size = 3482584, upload-time = "2025-08-07T08:51:32.535Z" }, { url = "https://files.pythonhosted.org/packages/a7/d1/3f90561df013f6a015ef19de22726b64073fee405f53d3c4b8255ab05a67/torchcodec-0.6.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:fdef91a17fb1f1a159ce23710324a9a4e6d6a885275de73700f94a9ad562c6b2", size = 1370954, upload-time = "2025-08-07T08:51:15.021Z" }, - { url = "https://files.pythonhosted.org/packages/87/d0/0b5dd42652e4527d578e1d6239dbb907bf83e502115e517b83a55d8b7f8b/torchcodec-0.6.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:de20cab5df7fa7cdd74ec1dc0d508324685573f86de6789f0ebb860b7ea20b33", size = 3446017, upload-time = "2025-08-07T08:51:34.484Z" }, { url = "https://files.pythonhosted.org/packages/97/62/a938334e39101d4304619b90847d8aef7d1c607c6bcf33638f72931ae990/torchcodec-0.6.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:46dab701a2d809e975a8b07d7ee47ed34f1d903511e374c74cfc1de6a5ab0e3f", size = 1374794, upload-time = "2025-08-07T08:51:17.355Z" }, ] From ed69abdf1d9f7e9d3fe2b07fb021039ce833e15d Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 25 Nov 2025 15:45:31 -0800 Subject: [PATCH 28/32] pre-commit Signed-off-by: adil-a --- .../models/policy/dtensor_policy_worker_v2.py | 4 +- pyproject.toml | 50 +++++++++---------- 2 
files changed, 28 insertions(+), 26 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 623291fcb5..b6703f8137 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -868,7 +868,9 @@ def train( num_label_tokens=1, dp_group_size=self.dp_size * self.cp_size, ) - grad_norm = torch.tensor(grad_norm, device="cpu", dtype=torch.float32) + grad_norm = torch.tensor( + grad_norm, device="cpu", dtype=torch.float32 + ) # Update parameters self.optimizer.step() diff --git a/pyproject.toml b/pyproject.toml index e2b56d1386..5be4aa2dda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,33 +53,33 @@ dependencies = [ [project.optional-dependencies] # Currently unused, but after https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved, we should use this for the "BASE" PYEXECUTABLE automodel = [ - "nemo-automodel", - # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) - # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 - # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 - "vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved - "flash-attn==2.8.1", - "mamba-ssm", - "causal-conv1d", - "grouped_gemm @ git+https://github.com/fanshiqing/grouped_gemm@v1.1.4", - "transformer-engine[pytorch]==2.8.0", - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", + "nemo-automodel", + # Flash-attn version should be selected to satisfy both TE + vLLM requirements (xformers in particular) + # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108 + # 
https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76 + "vllm==0.11.0", # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved + "flash-attn==2.8.1", + "mamba-ssm", + "causal-conv1d", + "grouped_gemm @ git+https://github.com/fanshiqing/grouped_gemm@v1.1.4", + "transformer-engine[pytorch]==2.8.0", + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", ] vllm = [ - "cuda-python", - "deep_gemm @ git+https://github.com/deepseek-ai/DeepGEMM.git@7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c", - # deep_ep also needs libibverbs-dev - # sudo apt-get update - # sudo apt-get install libibverbs-dev - "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", - "vllm==0.11.0", - "num2words>=0.5.14", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "flash-attn==2.8.1", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "mamba-ssm", - # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved - "causal-conv1d", + "cuda-python", + "deep_gemm @ git+https://github.com/deepseek-ai/DeepGEMM.git@7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c", + # deep_ep also needs libibverbs-dev + # sudo apt-get update + # sudo apt-get install libibverbs-dev + "deep_ep @ git+https://github.com/deepseek-ai/DeepEP.git@bfded34800dfec415b71503f8205181de90b2480", + "vllm==0.11.0", + "num2words>=0.5.14", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "flash-attn==2.8.1", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "mamba-ssm", + # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved + "causal-conv1d", ] mcore = [ # also need cudnn (https://developer.nvidia.com/cudnn-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=20.04&target_type=deb_network) From 
b754c7c24f137bf1299b54764522bef0e7eafa4e Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 25 Nov 2025 16:09:46 -0800 Subject: [PATCH 29/32] ckpt fix Signed-off-by: adil-a --- .../recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 4 ++++ nemo_rl/models/policy/dtensor_policy_worker_v2.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index 1910cb665f..f39f6815af 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -25,3 +25,7 @@ policy: master_weights: true exp_avg_dtype: bfloat16 exp_avg_sq_dtype: bfloat16 +checkpointing: + enabled: true + checkpoint_dir: "results/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel" + save_period: 10 diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index b6703f8137..0ea4328c1b 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -1917,6 +1917,7 @@ def save_checkpoint( the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. 
""" + print(f"Saving checkpoint to {weights_path}") if checkpointing_cfg is None: raise ValueError( "checkpointing_cfg must be provided when saving checkpoint" @@ -1988,6 +1989,7 @@ def load_checkpoint( config_updates={ "model_save_format": model_save_format, "is_peft": is_peft, + "dequantize_base_checkpoint": False, # the saved checkpoint is already dequantized }, checkpoint_root=checkpoint_root, ) From 3877e79b77f5fe09cbe141bc5f8d65992a829253 Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 25 Nov 2025 21:25:13 -0800 Subject: [PATCH 30/32] pre commit Signed-off-by: adil-a --- .../recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml index f39f6815af..0f150b1c4a 100644 --- a/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml +++ b/examples/configs/recipes/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.yaml @@ -26,6 +26,4 @@ policy: exp_avg_dtype: bfloat16 exp_avg_sq_dtype: bfloat16 checkpointing: - enabled: true - checkpoint_dir: "results/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel" - save_period: 10 + checkpoint_dir: results/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel From 661b59697ba57cd292c0c47ecbd6df75f6ee6e9d Mon Sep 17 00:00:00 2001 From: adil-a Date: Tue, 25 Nov 2025 21:27:18 -0800 Subject: [PATCH 31/32] Sync Automodel submodule to origin/main --- 3rdparty/Automodel-workspace/Automodel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Automodel-workspace/Automodel b/3rdparty/Automodel-workspace/Automodel index d7f248adf3..756ed10c29 160000 --- a/3rdparty/Automodel-workspace/Automodel +++ b/3rdparty/Automodel-workspace/Automodel @@ -1 +1 @@ -Subproject commit d7f248adf367585f0bd9c5febea6401a6cd6ea4f +Subproject commit 756ed10c29039cd9af551761d054a526021f559d From d89180c46f15fed2d56ed06536a8c80623425a9e 
Mon Sep 17 00:00:00 2001 From: adil-a Date: Wed, 26 Nov 2025 09:27:52 -0800 Subject: [PATCH 32/32] removing RL specific changes for future PR Signed-off-by: adil-a --- nemo_rl/models/policy/dtensor_policy_worker_v2.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py index 0ea4328c1b..5e749d4815 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py +++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py @@ -184,7 +184,7 @@ def __init__( hf_config_overrides = self.cfg.get("hf_config_overrides", {}) or {} - # Choose attention implementation consistent with train_ft.py logic + # Choose attention implementation on the following basis: # - Packed sequence requires FA2 and CP must be 1 # - CP > 1 requires SDPA cp_size_cfg = self.cfg["dtensor_cfg"]["context_parallel_size"] @@ -273,7 +273,7 @@ def __init__( **automodel_model_kwargs, ) - # Hold a copy of model state_dict keys before any parallelization (as in train_ft.py) + # Hold a copy of model state_dict keys before any parallelization self.model_state_dict_keys = list(self.model.state_dict().keys()) if self.model.config.pad_token_id is None: @@ -1724,9 +1724,7 @@ def maybe_init_zmq(self): def prepare_refit_info(self) -> Optional[dict[str, Any]]: """Prepare state dict metadata for weight refitting and IPC streaming.""" state_dict_info = {} - state_dict = self.model.state_dict() - state_dict = _maybe_adapt_state_dict_to_hf(self.model, state_dict) - for name, tensor in state_dict.items(): + for name, tensor in self.model.state_dict().items(): # all tensor will be casted to self.dtype in stream_weights_via_ipc_zmq/broadcast_weights_for_collective state_dict_info[name] = (tensor.shape, self.dtype) @@ -1752,9 +1750,7 @@ def stream_weights_via_ipc_zmq(self, buffer_size_bytes: int = 0) -> None: def dtensor_params_generator(): """Generator that yields (name, tensor) pairs, converting 
DTensors to local tensors.""" - state_dict = self.model.state_dict() - state_dict = _maybe_adapt_state_dict_to_hf(self.model, state_dict) - for name, tensor in state_dict.items(): + for name, tensor in self.model.state_dict().items(): if isinstance(tensor, DTensor): # Convert DTensor to full tensor for streaming full_tensor = tensor.full_tensor()