From c92459488b0b059190c96a21c20cec8414894ba1 Mon Sep 17 00:00:00 2001 From: drisspg Date: Tue, 22 Oct 2024 13:53:34 -0700 Subject: [PATCH 001/161] Fix test on windows (#138641) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138641 Approved by: https://github.com/huydhn --- test/test_transformers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_transformers.py b/test/test_transformers.py index f9bfb71b658a1..0329d86fafb23 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2817,7 +2817,9 @@ def test_fused_sdp_choice(self, device, type: str): elif PLATFORM_SUPPORTS_FLASH_ATTENTION: self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.FLASH_ATTENTION.value) elif type != "nested" and PLATFORM_SUPPORTS_CUDNN_ATTENTION: # e.g., we're on Windows - self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.CUDNN_ATTENTION.value) + self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.EFFICIENT_ATTENTION.value) + with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION]): + self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.CUDNN_ATTENTION.value) else: self.assertEqual(torch._fused_sdp_choice(query, key, value), SDPBackend.EFFICIENT_ATTENTION.value) From 1b312489335f9c27fb85c1a93f77bc17edf5d074 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:15:35 +0000 Subject: [PATCH 002/161] [EZ] Fix typo in test_mps.py (#138738) s/emedding_weight/embedding_weight/ Stolen from https://github.com/pytorch/pytorch/pull/138640/commits/074766d9b4eddec7b0d757fcadedabbb2d54d358 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138738 Approved by: https://github.com/atalman --- test/test_mps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index 94b092aaede02..6a9adf7decd8c 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -7385,7 +7385,7 @@ def helper(value, dim, index, idx_dtype=torch.int32): def test_embedding_dense_backward(self): def helper(n, d, m, idx): embeddingMPS = nn.Embedding(n, d, max_norm=True, device='mps') - emedding_weight = embeddingMPS.weight.detach().cpu() + embedding_weight = embeddingMPS.weight.detach().cpu() W_MPS = torch.randn((m, d), requires_grad=True, device='mps') idx_MPS = torch.tensor(idx, device='mps') a_MPS = embeddingMPS.weight.clone() @ W_MPS.t() # weight must be cloned for this to be differentiable @@ -7396,7 +7396,7 @@ def helper(n, d, m, idx): loss_MPS = out_MPS.sigmoid().prod() loss_MPS.backward() - embeddingCPU = nn.Embedding(n, d, max_norm=True, _weight=emedding_weight) + embeddingCPU = nn.Embedding(n, d, max_norm=True, _weight=embedding_weight) W_CPU = W_MPS.to('cpu') idx_CPU = torch.tensor(idx) a_CPU = embeddingCPU.weight.clone() @ W_CPU.t() # weight must be cloned for this to be differentiable From 889717aabdfd48de0086309b57310792cf2e2ad6 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 23 Oct 2024 22:38:30 +0000 Subject: [PATCH 003/161] [CI/CD] Disable split build (#138752) See https://github.com/pytorch/pytorch/issues/138750 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138752 Approved by: https://github.com/kit1980, https://github.com/huydhn --- .github/scripts/generate_ci_workflows.py | 60 +- ...ated-linux-binary-manywheel-split-main.yml | 182 -- ...d-linux-binary-manywheel-split-nightly.yml | 1796 ----------------- .github/workflows/periodic.yml | 3 + 
.github/workflows/pull.yml | 1 + .github/workflows/trunk.yml | 1 + 6 files changed, 36 insertions(+), 2007 deletions(-) delete mode 100644 .github/workflows/generated-linux-binary-manywheel-split-main.yml delete mode 100644 .github/workflows/generated-linux-binary-manywheel-split-nightly.yml diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f9c857a3ed9cb..e99f95944245c 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -114,20 +114,21 @@ class OperatingSystem: isolated_workflow=True, ), ), - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - use_split_build=True, - arches=["11.8", "12.1", "12.4", "cpu"], - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, - isolated_workflow=True, - ), - use_split_build=True, - ), + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # use_split_build=True, + # arches=["11.8", "12.1", "12.4", "cpu"], + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, + # isolated_workflow=True, + # ), + # use_split_build=True, + # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="conda", @@ -180,21 +181,22 @@ class OperatingSystem: ), branches="main", ), - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - arches=["11.8", "12.1", "12.4"], - python_versions=["3.9"], - use_split_build=True, - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_PERIODIC}, - ), - branches="main", - use_split_build=True, - ), + # See https://github.com/pytorch/pytorch/issues/138750 + # BinaryBuildWorkflow( + # os=OperatingSystem.LINUX, + # package_type="manywheel", + # build_configs=generate_binary_build_matrix.generate_wheels_matrix( + # OperatingSystem.LINUX, + # arches=["11.8", "12.1", "12.4"], + # python_versions=["3.9"], + # use_split_build=True, + # ), + # ciflow_config=CIFlowConfig( + # labels={LABEL_CIFLOW_PERIODIC}, + # ), + # branches="main", + # use_split_build=True, + # ), BinaryBuildWorkflow( os=OperatingSystem.LINUX, package_type="libtorch", diff --git a/.github/workflows/generated-linux-binary-manywheel-split-main.yml b/.github/workflows/generated-linux-binary-manywheel-split-main.yml deleted file mode 100644 index 59f6c4b6f03a1..0000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-split-main.yml +++ /dev/null @@ -1,182 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel-split - - -on: - push: - branches: - - main - tags: - - 'ciflow/periodic/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel-split - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: 
/artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder 
- PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - - manywheel-py3_9-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-split-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-split-nightly.yml deleted file mode 100644 index c87a511094e2c..0000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-split-nightly.yml +++ /dev/null @@ -1,1796 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel-split - - -on: - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel-split - BUILDER_ROOT: /builder - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ 
github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - 
manywheel-py3_9-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_9-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_10-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_10-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_10-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu - build_environment: linux-binary-manywheel-split - 
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - 
manywheel-py3_11-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} 
- permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda12_1-full-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_1-full - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_1-full-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_1-full-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_1-full - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_1-full-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_1-full-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_1-full - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - 
GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_11-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we 
eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_12-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_12-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; 
platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_12-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_12-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: 
/builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cpu-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13t-cuda11_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda11_8 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda11_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cuda11_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda11_8 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda11_8-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cuda11_8-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda11_8 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13t-cuda12_1-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_1 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_1-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cuda12_1-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_1 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_1-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cuda12_1-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_1 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13t-cuda12_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_4 - build_environment: linux-binary-manywheel-split - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cuda12_4-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_4 - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_4-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cuda12_4-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu124 - GPU_ARCH_VERSION: 12.4 - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_4 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - - manywheel-py3_13t-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cpu - build_environment: linux-binary-manywheel-split - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cpu-build - - get-label-type - uses: 
./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu - build_environment: linux-binary-manywheel-split - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cpu-test - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - use_split_build: True - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cpu - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 098fbbaa6c3dd..f5e71bceb4d26 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -333,6 +333,7 @@ jobs: name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type + if: false # See https://github.com/pytorch/pytorch/issues/138750 with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" use_split_build: true @@ -363,6 +364,7 @@ jobs: name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type + if: false # See https://github.com/pytorch/pytorch/issues/138750 with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" use_split_build: true @@ -390,6 +392,7 @@ jobs: name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type + if: false # See https://github.com/pytorch/pytorch/issues/138750 with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" use_split_build: true diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c2cd5ba2f7456..17b700f9eadf4 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -579,6 +579,7 @@ jobs: secrets: inherit linux-focal-py3_12-clang10-experimental-split-build: + if: false # See https://github.com/pytorch/pytorch/issues/138750 name: linux-focal-py3.12-clang10-experimental-split-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index fd1f5445cea77..276cb9155b58a 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -256,6 +256,7 @@ jobs: tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build: + if: false # See https://github.com/pytorch/pytorch/issues/138750 name: 
linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type From 8945309c08f8c373930604db9a489a7e3d576968 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 21 Oct 2024 13:16:25 -0700 Subject: [PATCH 004/161] [Pipelining] fix extra memory usage in zero bubble (#138119) Full debugging details are here: https://docs.google.com/document/d/1Pe_E0KWAfsJ6MCvKZ5aR28rTXX-rYLg13XxwXd6AALw/edit?usp=sharing In zero bubble, we have two methods, `stage_backward_input` and `stage_backward_weight`. During `stage_backward_input` we compute the gradients of the input with respect to the stage outputs and also retain the autograd graph (unlike 1F1B, where `retain_graph=False`). The output/loss was still being retained across the next schedule step() because we return the loss to the user and use the output in the next step. To allow autograd to free the variables in the graph, we need to detach the output/loss once autograd no longer needs them. Pre-fix and post-fix memory snapshots: (images omitted). Without AC (7B model on titan): 10% memory improvement; with AC (7B model on titan): 50% memory improvement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138119 Approved by: https://github.com/wconstab, https://github.com/kwen2501 --- torch/distributed/pipelining/_backward.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch/distributed/pipelining/_backward.py b/torch/distributed/pipelining/_backward.py index d476cd0d47872..f49d8fdbfb5e2 100644 --- a/torch/distributed/pipelining/_backward.py +++ b/torch/distributed/pipelining/_backward.py @@ -201,8 +201,15 @@ def hook(grad_inputs): inp.grad = dinputs[i] else: inp.grad += dinputs[i] + + # stage_outputs are not used in backwards after this point, so we can safely remove it from the autograd graph + # this allows autograd to clear up the graph dedicated for this output and free up significant memory + for t in stage_outputs: + t.detach_() + else: dinputs = None + return dinputs, param_groups From 32a3dbc6450171dec4ef62a36037dd5dc24790d2 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 21 Oct 2024 13:25:02 -0700 Subject: [PATCH 005/161] [Pipelining] Free memory usage earlier in last stage (#138504) This fix is similar to the one in #138119, except it handles an edge case in the last stage. For the last stage we perform backward on the `loss`, which we detached in the previous PR. However, we also hold the `stage_outputs` alive because we return all the output chunks in `merge_output_chunks()` after the step is over. This also keeps the autograd graph alive, so detaching these tensors frees the memory earlier.
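To make the mechanism behind #138119 and #138504 concrete, here is a minimal, self-contained sketch of the pattern both patches rely on: run the input-gradient and weight-gradient passes, then detach the retained outputs in place so that returning them to the caller does not keep the autograd graph (and its saved activations) alive. This is an illustration only, not code from the patches; the helper name run_microbatch and the toy model are made up for the example.

    import torch

    def run_microbatch(model, x):
        # Forward: `out` and `loss` carry grad_fns that keep the autograd graph alive.
        out = model(x)
        loss = out.sum()
        # Input-gradient pass, retaining the graph for the later weight-gradient
        # pass (mirroring what stage_backward_input does in zero bubble).
        (dinput,) = torch.autograd.grad(loss, (x,), retain_graph=True)
        # Weight-gradient pass (mirroring stage_backward_weight).
        torch.autograd.backward(loss, inputs=list(model.parameters()))
        # The schedule still returns `loss`/`out` to the caller, so detach them
        # in place; otherwise those references keep the graph reachable across
        # the next step().
        out.detach_()
        loss.detach_()
        return out, loss, dinput

    model = torch.nn.Linear(8, 8)
    x = torch.randn(4, 8, requires_grad=True)
    out, loss, dinput = run_microbatch(model, x)
    assert out.grad_fn is None and loss.grad_fn is None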
Pre-fix and post-fix memory snapshots: (images omitted). Pull Request resolved: https://github.com/pytorch/pytorch/pull/138504 Approved by: https://github.com/wconstab ghstack dependencies: #138119 --- torch/distributed/pipelining/stage.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/torch/distributed/pipelining/stage.py b/torch/distributed/pipelining/stage.py index 782dddd5d684e..9c47f68e20c07 100644 --- a/torch/distributed/pipelining/stage.py +++ b/torch/distributed/pipelining/stage.py @@ -695,6 +695,13 @@ def backward_one_chunk( self.grads_input = grads_input # Save a placeholder for the dw_runner self.dw_runner[bwd_chunk_id] = lambda: None + + if self.is_last: + # stage_output is no longer used in the last stage for backward and only needed + # to return to the user in merge_output_chunks, therefore + # this should be detached to release autograd graph context and free memory earlier + for t in stage_output: + t.detach_() logger.debug("%s Backwarded chunk %s", self.log_prefix, bwd_chunk_id) def backward_weight_one_chunk(self, bwd_chunk_id: int): From 0b9320b7c540812e210e5e2bbef16cf85bccc80d Mon Sep 17 00:00:00 2001 From: "Colin L. Rice" Date: Thu, 24 Oct 2024 01:30:39 +0000 Subject: [PATCH 006/161] fx_graph_cache: Remove custom amd JK (#137501) This split in JKs was never actually used (we just set both JKs to the same values, except when we accidentally didn't, because we are humans who make mistakes). This simplifies the overall JK structure and will eventually let us delete the duplicate JK. Pull Request resolved: https://github.com/pytorch/pytorch/pull/137501 Approved by: https://github.com/oulgen --- torch/_inductor/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 4f3d4c1808b2c..383291c56ee21 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -2095,11 +2095,9 @@ def should_use_remote_fx_graph_cache(): except ModuleNotFoundError: return False - jk_name = "pytorch/remote_cache:fx_graph_memcache_version" - if torch.version.hip is not None: - jk_name = "pytorch/remote_cache:fx_graph_memcache_version_amd" - - return REMOTE_CACHE_VERSION >= torch._utils_internal.justknobs_getval_int(jk_name) + return REMOTE_CACHE_VERSION >= torch._utils_internal.justknobs_getval_int( + "pytorch/remote_cache:fx_graph_memcache_version" + ) def normalize_name(name: str) -> str: From d8f22a114142f40f78e1145e45d7eaf823b516cb Mon Sep 17 00:00:00 2001 From: fduwjj Date: Thu, 24 Oct 2024 02:15:47 +0000 Subject: [PATCH 007/161] [c10d] Reorder GIL checker and c++ stack trace print with comments (#138734) We found one case where a GIL deadlock happened and the flight recorder (FR) dump then timed out, so this change runs the GIL check before the C++ stack trace print, which can otherwise hang. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138734 Approved by: https://github.com/c-p-i-o --- .../distributed/c10d/ProcessGroupNCCL.cpp | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 6a4b240c51bdd..eb16d6e09c904 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1573,6 +1573,15 @@ void ProcessGroupNCCL::heartbeatMonitor() { } LOG(ERROR) << errorMsg; + // We perform some checks to help users debug the timeout/hang issue: + // 1.
Dump the nccl trace (flight recorder) to help debug the issue + // (timeout after waitTimeoutDumpInMilSec_, which is one minute). + // 2. Check if there is a GIL deadlock (timeout after 300ms). + // 3. Try to dump the c++ stacktraces (blocking and would hang, + // users can turn this off by set + // TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN=0). + + // Dump the nccl trace (flight recorder). if (checkDumpSignal && shouldDump_.load()) { // Store debug info to storage if no other thread does it. (By default to // local disk) @@ -1588,14 +1597,7 @@ void ProcessGroupNCCL::heartbeatMonitor() { true); } - auto& cpp_dumper = get_cpp_trace_dumper(); - if (logCppStackOnUncleanShutdown_ && cpp_dumper.has_value()) { - LOG(INFO) << logPrefix() << "Dumping c++ stacktraces:"; - cpp_dumper.value()( - [&](const std::string& line) { LOG(INFO) << logPrefix() << line; }); - LOG(INFO) << logPrefix() << "Finished c++ stacktraces dump."; - } - + // GIL deadlock check. if (get_gil_checker() != nullptr) { auto fut = launchAsyncGilCheck(); auto kGilCheckTimeout = std::chrono::milliseconds(300); @@ -1614,6 +1616,15 @@ void ProcessGroupNCCL::heartbeatMonitor() { << "GIL checker was not registered, perhaps this is a no-python build?"; } + // Dump the c++ stacktraces. + auto& cpp_dumper = get_cpp_trace_dumper(); + if (logCppStackOnUncleanShutdown_ && cpp_dumper.has_value()) { + LOG(INFO) << logPrefix() << "Dumping c++ stacktraces:"; + cpp_dumper.value()( + [&](const std::string& line) { LOG(INFO) << logPrefix() << line; }); + LOG(INFO) << logPrefix() << "Finished c++ stacktraces dump."; + } + // There are two possible cases for the watchdog thread exit: // Case one: desync report runs quickly, and it follows the step: // collective timeout -> desync -> exception handling -> destructors From e5c3d7ab77fdad885b575824386bd1c770a24bdf Mon Sep 17 00:00:00 2001 From: Doru Bercea Date: Thu, 24 Oct 2024 03:41:16 +0000 Subject: [PATCH 008/161] [ROCm] Improve performance of reductions on 1D and 2D tensors. (#137737) This patch improves the performance of individual reductions on MI300X. These improvements are measured on individual sum reduction operations of varying sizes. The patch impacts the following tensor types: - 1D tensors - 2D tensors when reducing along dimension 0 - 2D tensors when reducing along dimension 1 Runtime reduction between 0 and 75% depending on tensor shape. The patch uses the maximum number of threads per CU and the number of CUs itself to control the number of threadblocks in various situations (i.e. for various reduction types and tensor dimensions). 
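In plain Python, the ROCm-only heuristic added to Reduce.cuh (see the diff below) behaves roughly like the sketch that follows. This is a readable paraphrase, not the authoritative kernel-launch code: div_up is assumed to be ceiling division, and the example device is only assumed to report 304 multiprocessors.

    from math import ceil

    def rocm_max_threads_per_mp(ndim: int, hw_max_threads_per_mp: int) -> int:
        # Tighter per-MP thread budgets for 1D/2D reductions, which yields more
        # threadblocks per multiprocessor for those shapes.
        if ndim == 1:
            return 512
        if ndim == 2:
            return 256
        return hw_max_threads_per_mp

    def rocm_ctas_per_output(ctas_per_output: int, num_mp: int) -> int:
        # Clamp the number of blocks along grid.y to the multiprocessor count
        # (or a small multiple / half of it, depending on the requested size).
        if ctas_per_output > num_mp:
            if num_mp < 128:
                return num_mp * (4 if ctas_per_output > 512 else 2)
            return num_mp
        if ctas_per_output > ceil(num_mp / 2):
            return ceil(num_mp / 2)
        if ctas_per_output < 16:
            return 1
        return ctas_per_output

    # Example: a large requested block count on a device with 304 MPs.
    print(rocm_ctas_per_output(1024, 304))  # -> 304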
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137737 Approved by: https://github.com/eqy, https://github.com/jeffdaily, https://github.com/pruthvistony, https://github.com/xw285cornell --- aten/src/ATen/native/cuda/Reduce.cuh | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 5f03d7b9bda57..4baa3bd560a6d 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1092,11 +1092,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ } constexpr int min_values_per_thread = 16; -#ifndef USE_ROCM constexpr int max_values_per_thread = 256; -#else - constexpr int max_values_per_thread = 1024; -#endif if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) { // Divide the input across warps in a thread-block, if that leaves at least @@ -1108,7 +1104,18 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ config.output_mult[1] = config.split_output(block_height); } - const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads; + int max_threads_per_mp = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; +#ifdef USE_ROCM + // Control the number of threadblocks by adjusting the maximum number of + // threads per multi-processor. These numbers better reflect the maximum + // theoretical achievable threads per MP for the reduction operation. + if (iter.ndim() == 1) + max_threads_per_mp = 512; + if (iter.ndim() == 2) + max_threads_per_mp = 256; +#endif + const int blocks_per_sm = max_threads_per_mp / config.num_threads; const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; const int target_grid_size = num_mp * blocks_per_sm; int grid = config.grid().x; @@ -1126,6 +1133,23 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // a large number of values to deal with. But we don't want values_per_thread to be larger than // max_values_per_thread config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); +#ifdef USE_ROCM + // In cases where a number of threadblocks along the y direction of the grid + // is needed then make sure they are reduced to the number of MPs. For + // smaller sizes, use half the number of MPs. For smaller sizes than half + // the number of MPs use the original value unless the value is less than 16 + // blocks in which case it is more profitable to use just 1 block. + if (config.ctas_per_output > num_mp) + if (num_mp < 128) + config.ctas_per_output = + num_mp * (config.ctas_per_output > 512 ? 
4 : 2); + else + config.ctas_per_output = num_mp; + else if (config.ctas_per_output > div_up(num_mp, 2)) + config.ctas_per_output = div_up(num_mp, 2); + else if (config.ctas_per_output < 16) + config.ctas_per_output = 1; +#endif if (config.ctas_per_output > 1) { config.input_mult[2] = config.split_input(config.ctas_per_output); } From b1acd0978e2d2d0292ed5323647793e338d1576b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 23 Oct 2024 13:35:44 -0700 Subject: [PATCH 009/161] [dynamo] Support range_iterator as a function input (#138657) Fixes https://github.com/pytorch/pytorch/issues/138654 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138657 Approved by: https://github.com/williamwen42, https://github.com/jansel --- test/dynamo/test_repros.py | 30 +++++++++++++++++++++++++++++- test/test_reductions.py | 3 ++- torch/_dynamo/utils.py | 1 + torch/_dynamo/variables/builder.py | 10 ++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 9feac201a333c..e380939570cea 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -37,7 +37,7 @@ import torch.utils._pytree as pytree from torch import nn from torch._dynamo.debug_utils import same_two_models -from torch._dynamo.testing import CompileCounter, rand_strided, same +from torch._dynamo.testing import CompileCounter, rand_strided, same, skipIfPy312 from torch._inductor.utils import fresh_inductor_cache from torch.nn import functional as F from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION @@ -6170,6 +6170,34 @@ def fn(x, y): self.assertEqual(ref, res) + @skipIfPy312 # listcomp bytecode is optimized + def test_listcomp(self): + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self._num = 4 + + @torch._dynamo.disable(recursive=False) + def forward(self, x): + values = [i * torch.cos(x) for i in range(self._num)] + return sum(values) + + mod = Module() + + def fn(x): + return mod(x) + + cnt = torch._dynamo.testing.CompileCounter() + opt_fn = torch.compile(fn, backend=cnt) + x = torch.randn(4) + + ref = fn(x) + res = opt_fn(x) + self.assertEqual(ref, res) + self.assertEqual(cnt.frame_count, 1) + # Ensure that the listcomp is fully compiled + self.assertEqual(cnt.op_count, 8) + instantiate_parametrized_tests(ReproTests) diff --git a/test/test_reductions.py b/test/test_reductions.py index 323866c80153c..1e2625c4f606e 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -20,6 +20,7 @@ from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, parametrize, + skipIfTorchDynamo, IS_WINDOWS) from torch.testing._internal.common_device_type import ( OpDTypes, expectedFailureMeta, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, @@ -2589,7 +2590,7 @@ def check(op, a, args, key): self.assertEqual(a[:, ::2, :].median(-1)[0], torch.tensor([[0, 4], [6, 10]], device=device)) self.assertEqual(a[:, ::2, :].nanmedian(-1)[0], torch.tensor([[0, 4], [6, 10]], device=device)) - + @skipIfTorchDynamo("https://github.com/pytorch/pytorch/pull/138657 discovers a latent bug") @onlyNativeDeviceTypes @dtypes(torch.float, torch.double) def test_quantile(self, device, dtype): diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 5fa5104df0d13..c24c10d7e4e2f 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1508,6 +1508,7 @@ def check_numpy_ndarray_args(args, kwargs): 
dict_values: Type[ValuesView[Any]] = type({}.values()) odict_values: Type[ValuesView[Any]] = type(collections.OrderedDict().values()) tuple_iterator: Type[Iterator[Any]] = type(iter(())) +range_iterator: Type[Iterator[Any]] = type(iter(range(0))) tuple_iterator_len = tuple_iterator.__length_hint__ # type: ignore[attr-defined] object_new = object.__new__ diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index eeebb39af9f11..f8d8f0f54dc06 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -3,6 +3,7 @@ import abc import collections import contextlib +import copy import dataclasses import enum import functools @@ -106,6 +107,7 @@ istype, odict_values, proxy_args_kwargs, + range_iterator, set_example_value, tensor_always_has_static_shape, tuple_iterator, @@ -153,6 +155,7 @@ from .lazy import LazyVariableTracker from .lists import ( BaseListVariable, + ListIteratorVariable, ListVariable, NamedTupleVariable, RangeVariable, @@ -448,6 +451,7 @@ def _type_dispatch(cls): cls.wrap_listlike, ), (tuple_iterator, cls.wrap_tuple_iterator), + (range_iterator, cls.wrap_range_iterator), ((slice, range), cls.wrap_slice_range), (tuple(common_constant_types), cls.wrap_literal), (re.Pattern, cls.wrap_regex_pattern), @@ -1312,6 +1316,12 @@ def wrap_tuple_iterator(self, value: tuple_iterator): return self.set_source_and_track_mutable(value, result) + def wrap_range_iterator(self, value: range_iterator): + self.install_guards(GuardBuilder.TYPE_MATCH) + # Get all the values from the range iterator + items = [ConstantVariable.create(v) for v in copy.deepcopy(value)] + return ListIteratorVariable(items, mutable_local=MutableLocal()) + def wrap_slice_range(self, value: Union[slice, range]): items = [ VariableBuilder(self.tx, AttrSource(self.get_source(), k))( From cfdf658a919506c20805b9d241bb3b765227955f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 23 Oct 2024 13:35:45 -0700 Subject: [PATCH 010/161] [dynamo][modules] Support overridden __call__ on nn modules (#138619) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138619 Approved by: https://github.com/williamwen42 ghstack dependencies: #138657 --- test/dynamo/test_modules.py | 25 +++++++++++++++++++ ...est_get_fqn_to_example_inputs_complex_args | 0 ...t_get_fqn_to_example_inputs_default_kwargs | 0 ...tils.test_get_fqn_to_example_inputs_simple | 0 torch/_dynamo/utils.py | 2 ++ torch/_dynamo/variables/nn_module.py | 22 +++++++++++++--- 6 files changed, 46 insertions(+), 3 deletions(-) delete mode 100644 test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_complex_args delete mode 100644 test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_default_kwargs delete mode 100644 test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_simple diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index 329b04fd7d810..349bc42498e19 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -3046,6 +3046,31 @@ def forward(self, x): # Must be 3 compilations. If not marked static there would be 2, because strides would be converted to symints. 
self.assertEqual(cnts.frame_count, 3) + @patch.object(torch._dynamo.config, "inline_inbuilt_nn_modules", True) + def test_overridden_call(self): + class OverRiddenCallModule(torch.nn.Module): + def __init__(self): + super().__init__() + + def __call__(self, x): + # Overrides the __call__ method of torch.nn.Module + return 5 * self.forward(x) + + def forward(self, x): + return x * 3 + + m = OverRiddenCallModule() + + def fn(x): + return m(x) + + x = torch.ones(4) + ref = fn(x) + + opt_fn = torch.compile(fn, backend="eager", fullgraph=True) + res = opt_fn(x) + self.assertEqual(ref, res) + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_complex_args b/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_complex_args deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_default_kwargs b/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_default_kwargs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_simple b/test/dynamo_expected_failures/TestUtils.test_get_fqn_to_example_inputs_simple deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index c24c10d7e4e2f..445f4a1fb0ac6 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -120,6 +120,8 @@ T = TypeVar("T") unpatched_nn_module_getattr = torch.nn.Module.__getattr__ +unpatched_nn_module_call = torch.nn.Module.__call__ +unpatched_nn_module_call_impl = torch.nn.Module._call_impl counters: DefaultDict[str, Counter[str]] = collections.defaultdict(collections.Counter) optimus_scuba_log: Dict[str, Any] = {} diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py index 334e526c49756..08c036949a999 100644 --- a/torch/_dynamo/variables/nn_module.py +++ b/torch/_dynamo/variables/nn_module.py @@ -39,6 +39,8 @@ object_has_getattribute, proxy_args_kwargs, set_example_value, + unpatched_nn_module_call, + unpatched_nn_module_call_impl, ) from .base import MutableLocal, typestr, VariableTracker from .functions import invoke_and_store_as_constant @@ -857,12 +859,26 @@ def call_function( if mod.cls_to_become is not None: self.value_type = mod.cls_to_become initialize_lazy_module(tx, mod, args, kwargs) - name = "_call_impl" - fn = getattr(self.value_type, name) + + if ( + not isinstance(mod, torch.fx.GraphModule) + and mod.__call__.__func__ is not unpatched_nn_module_call + ): + name = "__call__" + fn = getattr(self.value_type, name) + else: + name = "_call_impl" + fn = getattr(self.value_type, name) # Check if we can short circuit nn.Module._call_impl to the forward # method. NB - This is done to reduce the compile time of Dynamo. 
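The hunk just below replaces the direct `fn is torch.nn.Module._call_impl` test with method-identity checks against the unpatched `nn.Module.__call__` and `_call_impl`, so a module whose class overrides `__call__` is traced through that override instead of being short-circuited to `forward`. A standalone sketch of the identity check, outside Dynamo:

```python
import torch

unpatched_call = torch.nn.Module.__call__  # stock implementation

class Plain(torch.nn.Module):
    def forward(self, x):
        return x * 3

class Overridden(Plain):
    def __call__(self, x):  # bypasses nn.Module.__call__ entirely
        return 5 * self.forward(x)

def overrides_call(mod: torch.nn.Module) -> bool:
    # A bound method's __func__ exposes the class-level function behind it.
    return mod.__call__.__func__ is not unpatched_call

print(overrides_call(Plain()), overrides_call(Overridden()))  # False True
```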
- if fn is torch.nn.Module._call_impl and "forward" not in mod.__dict__: + if ( + istype(mod.__call__, types.MethodType) + and istype(mod._call_impl, types.MethodType) + and mod.__call__.__func__ is unpatched_nn_module_call + and mod._call_impl.__func__ is unpatched_nn_module_call_impl + and "forward" not in mod.__dict__ + ): forward_method = inspect.getattr_static(mod, "forward") if isinstance(forward_method, types.FunctionType): globals_vt = tx.nn_modules_globals_vt From 53e356a1c045449942973ad3abe837a85b83b016 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 24 Oct 2024 04:35:18 +0000 Subject: [PATCH 011/161] [2/N] Enable cppcoreguidelines-special-member-functions (#138670) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138670 Approved by: https://github.com/sraikund16 --- aten/src/ATen/CPUApplyUtils.h | 14 ++++++--- c10/util/ThreadLocal.h | 3 ++ c10/util/ThreadLocalDebugInfo.h | 2 ++ torch/csrc/autograd/graph_task.h | 3 ++ .../csrc/dynamo/python_compiled_autograd.cpp | 1 + torch/csrc/profiler/containers.h | 4 ++- torch/csrc/profiler/kineto_shim.cpp | 2 -- torch/csrc/profiler/kineto_shim.h | 5 ---- torch/csrc/profiler/stubs/base.cpp | 29 ++++++++++--------- torch/csrc/profiler/stubs/base.h | 2 +- torch/csrc/profiler/unwind/communicate.h | 4 +++ torch/csrc/profiler/unwind/mem_file.h | 2 ++ torch/csrc/profiler/unwind/unwind.cpp | 5 +++- torch/csrc/utils/invalid_arguments.cpp | 1 + torch/csrc/utils/python_dispatch.cpp | 4 +++ torch/csrc/utils/tensor_new.cpp | 8 +++++ torch/csrc/utils/torch_dispatch_mode.h | 12 ++++++++ 17 files changed, 74 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index c8a735c177544..780510579a7ef 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -64,8 +64,12 @@ struct strided_tensor_iter_fixed { int64_t strides_[N] = {0}; strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete; - void operator=(strided_tensor_iter_fixed const& x) = delete; - strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default; + strided_tensor_iter_fixed& operator=(strided_tensor_iter_fixed const& x) = + delete; + strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) noexcept = default; + strided_tensor_iter_fixed& operator=(strided_tensor_iter_fixed&& x) noexcept = + default; + ~strided_tensor_iter_fixed() noexcept = default; strided_tensor_iter_fixed( Tensor& tensor, [[maybe_unused]] bool sort_strides = false) @@ -93,8 +97,10 @@ struct strided_tensor_iter { std::vector strides_; strided_tensor_iter(strided_tensor_iter const&) = delete; - void operator=(strided_tensor_iter const& x) = delete; - strided_tensor_iter(strided_tensor_iter&&) = default; + strided_tensor_iter& operator=(strided_tensor_iter const& x) = delete; + strided_tensor_iter(strided_tensor_iter&&) noexcept = default; + strided_tensor_iter& operator=(strided_tensor_iter&&) noexcept = default; + ~strided_tensor_iter() noexcept = default; strided_tensor_iter(Tensor& tensor) : data_(tensor.data_ptr()), dim_(tensor.ndimension()), diff --git a/c10/util/ThreadLocal.h b/c10/util/ThreadLocal.h index 850bb5d4c4269..c6f3d6d874b5c 100644 --- a/c10/util/ThreadLocal.h +++ b/c10/util/ThreadLocal.h @@ -115,7 +115,10 @@ class ThreadLocal { explicit ThreadLocal(Accessor accessor) : accessor_(accessor) {} ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal(ThreadLocal&&) noexcept = default; ThreadLocal& operator=(const ThreadLocal&) = delete; + ThreadLocal& operator=(ThreadLocal&&) noexcept = 
default; + ~ThreadLocal() = default; Type& get() { return *accessor_(); diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index bea8c5f27ac82..3d26dd44f6a52 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -74,6 +74,8 @@ class C10_API DebugInfoGuard { DebugInfoGuard(const DebugInfoGuard&) = delete; DebugInfoGuard(DebugInfoGuard&&) = delete; + DebugInfoGuard& operator=(const DebugInfoGuard&) = delete; + DebugInfoGuard& operator=(DebugInfoGuard&&) = delete; private: bool active_ = false; diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h index e4a7ae4dad18e..018beaffdaaff 100644 --- a/torch/csrc/autograd/graph_task.h +++ b/torch/csrc/autograd/graph_task.h @@ -48,6 +48,9 @@ struct GraphTask : std::enable_shared_from_this { struct Capture { Capture(const Capture&) = delete; Capture(Capture&&) = default; + Capture& operator=(const Capture&) = delete; + Capture& operator=(Capture&&) = default; + ~Capture() = default; Capture(int input_idx, int output_idx) : input_idx_(input_idx), output_idx_(output_idx) {} diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index 7a5969fffba16..024603270f787 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -777,6 +777,7 @@ CacheNode* _compiled_autograd_impl( return cache; } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) struct LockGuardWithErrorLogs { LockGuardWithErrorLogs(std::mutex& mtx) : mtx_(mtx) { // Note: the standard allows try_lock to fail spuriously during races for diff --git a/torch/csrc/profiler/containers.h b/torch/csrc/profiler/containers.h index 6ff73917d9147..060c6e3b5341d 100644 --- a/torch/csrc/profiler/containers.h +++ b/torch/csrc/profiler/containers.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -52,7 +51,10 @@ class AppendOnlyList { AppendOnlyList() : buffer_last_{buffer_.before_begin()} {} AppendOnlyList(const AppendOnlyList&) = delete; + AppendOnlyList(AppendOnlyList&&) = delete; AppendOnlyList& operator=(const AppendOnlyList&) = delete; + AppendOnlyList& operator=(AppendOnlyList&&) = delete; + ~AppendOnlyList() = default; size_t size() const { return n_blocks_ * ChunkSize - (size_t)(end_ - next_); diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp index c1c8feea13c45..ef70242eafb35 100644 --- a/torch/csrc/profiler/kineto_shim.cpp +++ b/torch/csrc/profiler/kineto_shim.cpp @@ -96,8 +96,6 @@ TraceWrapper::TraceWrapper(const int64_t start_time, const std::string& name) } #endif // USE_KINETO -TraceWrapper::~TraceWrapper() = default; - activity_t* TraceWrapper::addCPUActivity( const std::string& name, const libkineto::ActivityType type, diff --git a/torch/csrc/profiler/kineto_shim.h b/torch/csrc/profiler/kineto_shim.h index 44509e4a5e64e..085e9dd2fcb2d 100644 --- a/torch/csrc/profiler/kineto_shim.h +++ b/torch/csrc/profiler/kineto_shim.h @@ -67,9 +67,6 @@ void addMetadata( // Wraps: libkineto::CpuTraceBuffer struct TraceWrapper { TraceWrapper(const int64_t start_time, const std::string& name); - TraceWrapper(TraceWrapper&&) = default; - TraceWrapper(const TraceWrapper&) = delete; - ~TraceWrapper(); // The caller is expected to hold a mutex when calling `addCPUActivity`. 
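These edits all serve the `cppcoreguidelines-special-member-functions` check: once any special member function is user-declared, the remaining ones are declared explicitly as `= default` or `= delete` rather than left implicit. A condensed, self-contained illustration of the pattern (the class below is not from the codebase):

```cpp
#include <utility>

// Movable but non-copyable handle: all five special members are spelled out.
class Handle {
 public:
  explicit Handle(int fd) : fd_(fd) {}
  ~Handle() = default;

  Handle(const Handle&) = delete;
  Handle& operator=(const Handle&) = delete;

  Handle(Handle&& other) noexcept : fd_(std::exchange(other.fd_, -1)) {}
  Handle& operator=(Handle&& other) noexcept {
    fd_ = std::exchange(other.fd_, -1);
    return *this;
  }

 private:
  int fd_;
};

int main() {
  Handle a(3);
  Handle b(std::move(a));  // fine; copying `a` would not compile
  (void)b;
}
```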
activity_t* addCPUActivity( @@ -96,8 +93,6 @@ struct TraceWrapper { struct ActivityTraceWrapper { explicit ActivityTraceWrapper(std::unique_ptr&& trace); ActivityTraceWrapper() = default; - ActivityTraceWrapper(ActivityTraceWrapper&&) = default; - ActivityTraceWrapper(const ActivityTraceWrapper&) = delete; explicit operator bool() const; void save(const std::string& path); diff --git a/torch/csrc/profiler/stubs/base.cpp b/torch/csrc/profiler/stubs/base.cpp index a5a5dead6fa01..6ee455ca7e97f 100644 --- a/torch/csrc/profiler/stubs/base.cpp +++ b/torch/csrc/profiler/stubs/base.cpp @@ -1,28 +1,31 @@ -#include - +#include #include +#include +#include +#include namespace torch::profiler::impl { -ProfilerStubs::~ProfilerStubs() = default; - namespace { struct DefaultStubs : public ProfilerStubs { - DefaultStubs(const char* name) : name_{name} {} + explicit DefaultStubs(const char* name) : name_{name} {} - void record(c10::DeviceIndex*, ProfilerVoidEventStub*, int64_t*) - const override { + void record( + c10::DeviceIndex* /*device*/, + ProfilerVoidEventStub* /*event*/, + int64_t* /*cpu_ns*/) const override { fail(); } - float elapsed(const ProfilerVoidEventStub*, const ProfilerVoidEventStub*) - const override { + float elapsed( + const ProfilerVoidEventStub* /*event*/, + const ProfilerVoidEventStub* /*event2*/) const override { fail(); - return 0.f; + return 0.F; } - void mark(const char*) const override { + void mark(const char* /*name*/) const override { fail(); } - void rangePush(const char*) const override { + void rangePush(const char* /*name*/) const override { fail(); } void rangePop() const override { @@ -31,7 +34,7 @@ struct DefaultStubs : public ProfilerStubs { bool enabled() const override { return false; } - void onEachDevice(std::function) const override { + void onEachDevice(std::function /*op*/) const override { fail(); } void synchronize() const override { diff --git a/torch/csrc/profiler/stubs/base.h b/torch/csrc/profiler/stubs/base.h index c8a0e6cd2ebbe..c64f4e5a6c9e9 100644 --- a/torch/csrc/profiler/stubs/base.h +++ b/torch/csrc/profiler/stubs/base.h @@ -33,7 +33,7 @@ struct TORCH_API ProfilerStubs { } virtual void onEachDevice(std::function op) const = 0; virtual void synchronize() const = 0; - virtual ~ProfilerStubs(); + virtual ~ProfilerStubs() = default; }; TORCH_API void registerCUDAMethods(ProfilerStubs* stubs); diff --git a/torch/csrc/profiler/unwind/communicate.h b/torch/csrc/profiler/unwind/communicate.h index 6ace27c543d88..bdaca33b6db2f 100644 --- a/torch/csrc/profiler/unwind/communicate.h +++ b/torch/csrc/profiler/unwind/communicate.h @@ -41,6 +41,10 @@ struct Communicate { err_ = std::make_unique(errbuf_.get()); } } + Communicate(const Communicate&) = delete; + Communicate(Communicate&&) = delete; + Communicate& operator=(const Communicate&) = delete; + Communicate& operator=(Communicate&&) = delete; ~Communicate() { close(inpipe_[1]); close(outpipe_[0]); diff --git a/torch/csrc/profiler/unwind/mem_file.h b/torch/csrc/profiler/unwind/mem_file.h index b5b6807a7bbce..2580e6f6da55a 100644 --- a/torch/csrc/profiler/unwind/mem_file.h +++ b/torch/csrc/profiler/unwind/mem_file.h @@ -81,7 +81,9 @@ struct MemFile { } MemFile(const MemFile&) = delete; + MemFile(MemFile&&) = delete; MemFile& operator=(const MemFile&) = delete; + MemFile& operator=(MemFile&&) = delete; [[nodiscard]] const char* data() const { return (const char*)mem_; } diff --git a/torch/csrc/profiler/unwind/unwind.cpp b/torch/csrc/profiler/unwind/unwind.cpp index db903ca1af729..bed307245822f 100644 --- 
a/torch/csrc/profiler/unwind/unwind.cpp +++ b/torch/csrc/profiler/unwind/unwind.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #if !defined(__linux__) || !defined(__x86_64__) || !defined(__has_include) || \ !__has_include("ext/stdio_filebuf.h") @@ -66,6 +65,10 @@ struct UpgradeExclusive { rdlock_.unlock(); rdlock_.mutex()->lock(); } + UpgradeExclusive(const UpgradeExclusive&) = delete; + UpgradeExclusive(UpgradeExclusive&&) = delete; + UpgradeExclusive& operator=(const UpgradeExclusive&) = delete; + UpgradeExclusive& operator=(UpgradeExclusive&&) = delete; ~UpgradeExclusive() { rdlock_.mutex()->unlock(); rdlock_.lock(); diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index 4d69870145939..d26f8c2ee1da4 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -116,6 +116,7 @@ struct Option { Option(Option&& other) noexcept = default; Option& operator=(const Option&) = delete; Option& operator=(Option&&) = delete; + ~Option() = default; std::vector arguments; bool is_variadic; diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index 1dad860f6d9f0..c5a659f371da0 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -97,6 +97,10 @@ struct EnableHermeticPyObject { c10::impl::tls_set_dispatch_key_included( at::DispatchKey::PythonTLSSnapshot, old_python_snapshot_); } + EnableHermeticPyObject(const EnableHermeticPyObject&) = delete; + EnableHermeticPyObject(EnableHermeticPyObject&&) = delete; + EnableHermeticPyObject& operator=(const EnableHermeticPyObject&) = delete; + EnableHermeticPyObject& operator=(EnableHermeticPyObject&&) = delete; bool old_; bool old_excluded_python_; bool old_python_; diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 5de450c367a66..099991f841480 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -853,6 +853,14 @@ class CheckSparseTensorInvariantsContext { ~CheckSparseTensorInvariantsContext() { at::globalContext().setCheckSparseTensorInvariants(state); } + CheckSparseTensorInvariantsContext( + const CheckSparseTensorInvariantsContext&) = delete; + CheckSparseTensorInvariantsContext(CheckSparseTensorInvariantsContext&&) = + delete; + CheckSparseTensorInvariantsContext& operator=( + const CheckSparseTensorInvariantsContext&) = delete; + CheckSparseTensorInvariantsContext& operator=( + CheckSparseTensorInvariantsContext&&) = delete; private: bool state; diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h index 2eb8ba7a1cbbb..8fe5404b44a28 100644 --- a/torch/csrc/utils/torch_dispatch_mode.h +++ b/torch/csrc/utils/torch_dispatch_mode.h @@ -27,6 +27,12 @@ struct StashTorchDispatchModeGuard { std::move(saved_mode_)); } } + StashTorchDispatchModeGuard(const StashTorchDispatchModeGuard&) = delete; + StashTorchDispatchModeGuard(StashTorchDispatchModeGuard&&) = delete; + StashTorchDispatchModeGuard& operator=(const StashTorchDispatchModeGuard&) = + delete; + StashTorchDispatchModeGuard& operator=(StashTorchDispatchModeGuard&&) = + delete; const std::shared_ptr& get_cur_mode() { return saved_mode_; @@ -44,6 +50,12 @@ struct StashTorchDispatchStackGuard { c10::impl::TorchDispatchModeTLS::set_state(std::move(saved_state_)); saved_state_ = std::move(old); } + StashTorchDispatchStackGuard(const StashTorchDispatchStackGuard&) = delete; + StashTorchDispatchStackGuard(StashTorchDispatchStackGuard&&) = 
delete; + StashTorchDispatchStackGuard& operator=(const StashTorchDispatchStackGuard&) = + delete; + StashTorchDispatchStackGuard& operator=(StashTorchDispatchStackGuard&&) = + delete; ~StashTorchDispatchStackGuard() { c10::impl::TorchDispatchModeTLS::set_state(std::move(saved_state_)); From 2bcfbf25058d2986fa30b011895b1e835f459848 Mon Sep 17 00:00:00 2001 From: cyy Date: Thu, 24 Oct 2024 04:58:49 +0000 Subject: [PATCH 012/161] [Distributed] [17/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138465) Follows #137404 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138465 Approved by: https://github.com/ezyang --- .../csrc/distributed/c10d/DMAConnectivity.cpp | 9 ++-- torch/csrc/distributed/c10d/Ops.cpp | 3 ++ torch/csrc/distributed/c10d/ProcessGroup.cpp | 5 ++- torch/csrc/distributed/c10d/ProcessGroup.hpp | 2 +- .../distributed/c10d/ProcessGroupGloo.cpp | 41 +++++++++---------- .../csrc/distributed/c10d/SymmetricMemory.cpp | 16 ++++---- .../distributed/c10d/TCPStoreLibUvBackend.cpp | 2 +- .../c10d/control_plane/WorkerServer.cpp | 2 +- torch/csrc/distributed/c10d/init.cpp | 12 +++--- torch/csrc/distributed/c10d/reducer.cpp | 12 +++--- torch/csrc/distributed/c10d/socket.cpp | 4 +- 11 files changed, 57 insertions(+), 51 deletions(-) diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.cpp b/torch/csrc/distributed/c10d/DMAConnectivity.cpp index d920eb567197f..50c34f62426eb 100644 --- a/torch/csrc/distributed/c10d/DMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/DMAConnectivity.cpp @@ -1,10 +1,11 @@ #include +#include namespace { std::string get_detector_key( c10::DeviceType device_type, - std::string connection_type) { + const std::string& connection_type) { std::ostringstream oss; oss << device_type << "/" << connection_type; return oss.str(); @@ -12,6 +13,8 @@ std::string get_detector_key( class DetectorMap { public: + DetectorMap(const DetectorMap&) = delete; + DetectorMap& operator=(const DetectorMap&) = delete; static DetectorMap& get() { static DetectorMap instance; return instance; @@ -52,8 +55,6 @@ class DetectorMap { private: DetectorMap() = default; - DetectorMap(const DetectorMap&) = delete; - DetectorMap& operator=(const DetectorMap&) = delete; std::unordered_map< std::string, @@ -73,7 +74,7 @@ DMAConnectivity::DMAConnectivity( std::string connection_type, std::vector> matrix) : device_type(device_type), - connection_type(connection_type), + connection_type(std::move(connection_type)), matrix(std::move(matrix)) {} void register_dma_connectivity_detector( diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp index ae822ad397504..6251bfa1817dd 100644 --- a/torch/csrc/distributed/c10d/Ops.cpp +++ b/torch/csrc/distributed/c10d/Ops.cpp @@ -427,6 +427,7 @@ IMPL_ALLTOALL_BASE(CPU) IMPL_ALLTOALL_BASE(CUDA) IMPL_ALLTOALL_BASE(PrivateUse1) +// NOLINTBEGIN(performance-unnecessary-value-param) #define IMPL_BARRIER(DEV) \ c10::intrusive_ptr barrier##DEV( \ at::Tensor /* unused */, \ @@ -441,9 +442,11 @@ IMPL_ALLTOALL_BASE(PrivateUse1) IMPL_BARRIER(CPU) IMPL_BARRIER(CUDA) IMPL_BARRIER(PrivateUse1) +// NOLINTEND(performance-unnecessary-value-param) // NOLINTEND(cppcoreguidelines-pro-type-const-cast) void monitored_barrier_CPU( + // NOLINTNEXTLINE(performance-unnecessary-value-param) at::Tensor /* unused */, const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group, const std::vector& device_ids, diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index 
199b799dbe529..dffe20aebdd90 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace c10d { @@ -102,10 +103,10 @@ c10::intrusive_ptr ProcessGroup::getBackend( } ProcessGroup::ProcessGroup( - const c10::intrusive_ptr<::c10d::Store>& store, + c10::intrusive_ptr<::c10d::Store> store, int rank, int size) - : store_(store), + : store_(std::move(store)), rank_(rank), size_(size), backendType_(BackendType::UNDEFINED), diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index b74b650400e23..febf885a112b3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -105,7 +105,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { explicit ProcessGroup(int rank, int size); explicit ProcessGroup( - const c10::intrusive_ptr<::c10d::Store>& store, + c10::intrusive_ptr<::c10d::Store> store, int rank, int size); ~ProcessGroup() override; diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index 1931b23c4f15d..8ac81f4c396bd 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -654,7 +654,6 @@ void socketInitialize() { bool doesHostnameResolveToUsableAddress(const std::string& hostname) { socketInitialize(); struct addrinfo hints {}; - memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; struct addrinfo* result = nullptr; @@ -876,7 +875,7 @@ namespace { class AsyncBroadcastWork : public ProcessGroupGloo::AsyncWork { public: AsyncBroadcastWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector& inputs, int rootRank, int rootTensor, @@ -888,7 +887,7 @@ class AsyncBroadcastWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:broadcast", inputs), - context(context), + context(std::move(context)), inputs(inputs), rootRank(rootRank), rootTensor(rootTensor), @@ -1025,7 +1024,7 @@ namespace { class AsyncAllreduceWork : public ProcessGroupGloo::AsyncWork { public: AsyncAllreduceWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector& inputs, ReduceOp reduceOp, uint32_t tag, @@ -1036,7 +1035,7 @@ class AsyncAllreduceWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:all_reduce", inputs), - context(context), + context(std::move(context)), inputs(inputs), reduceOp(std::move(reduceOp)), tag(tag) {} @@ -1109,7 +1108,7 @@ class AsyncAllreduceCoalescedWork : public AsyncAllreduceWork { class AsyncSparseAllreduceWork : public ProcessGroupGloo::AsyncWork { public: AsyncSparseAllreduceWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector& inputs, uint32_t tag, uint64_t seq) @@ -1119,7 +1118,7 @@ class AsyncSparseAllreduceWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:sparse_all_reduce", inputs), - context(context), + context(std::move(context)), inputs(inputs), tag(tag) {} @@ -1626,7 +1625,7 @@ namespace { class AsyncReduceWork : public ProcessGroupGloo::AsyncWork { public: AsyncReduceWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector& inputs, int rootRank, int rootTensor, @@ -1639,7 +1638,7 @@ class AsyncReduceWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:reduce", inputs), - context(context), + context(std::move(context)), inputs(inputs), rootRank(rootRank), rootTensor(rootTensor), @@ -1804,7 +1803,7 @@ 
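The constructor changes in this file, here and in the hunks that follow, swap `const std::shared_ptr<gloo::Context>&` parameters for by-value parameters moved into the member, the usual sink-argument idiom: callers that hand over ownership avoid a refcount increment entirely. A minimal sketch of the idiom, with illustrative names:

```cpp
#include <memory>
#include <utility>

struct Context {};

class Worker {
 public:
  // Sink parameter: take the shared_ptr by value and move it into place.
  explicit Worker(std::shared_ptr<Context> context)
      : context_(std::move(context)) {}

 private:
  std::shared_ptr<Context> context_;
};

int main() {
  auto ctx = std::make_shared<Context>();
  Worker keeps(ctx);             // lvalue caller: one refcount increment, as before
  Worker takes(std::move(ctx));  // rvalue caller: no refcount traffic at all
}
```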
namespace { class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork { public: AsyncAllgatherWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector>& outputs, std::vector& inputs, uint32_t tag, @@ -1815,7 +1814,7 @@ class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:all_gather", inputs), - context(context), + context(std::move(context)), outputs(outputs), inputs(inputs), tag(tag) {} @@ -2076,7 +2075,7 @@ namespace { class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork { public: AsyncAllgatherCoalescedWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector>& output_lists, std::vector& input_list, uint32_t tag, @@ -2087,7 +2086,7 @@ class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:all_gather", input_list), - context(context), + context(std::move(context)), output_lists(output_lists), input_list(input_list), tag(tag) {} @@ -2218,7 +2217,7 @@ namespace { class AsyncGatherWork : public ProcessGroupGloo::AsyncWork { public: AsyncGatherWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector>& outputs, std::vector& inputs, int root, @@ -2230,7 +2229,7 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:gather", inputs), - context(context), + context(std::move(context)), outputs(outputs), inputs(inputs), root(root), @@ -2423,7 +2422,7 @@ namespace { class AsyncScatterWork : public ProcessGroupGloo::AsyncWork { public: AsyncScatterWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector& outputs, std::vector>& inputs, int root, @@ -2436,7 +2435,7 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork { "gloo:scatter", !inputs.empty() ? std::optional>(inputs[0]) : std::nullopt), - context(context), + context(std::move(context)), outputs(outputs), inputs(inputs), root(root), @@ -2618,7 +2617,7 @@ namespace { class AsyncAlltoallWork : public ProcessGroupGloo::AsyncWork { public: AsyncAlltoallWork( - const std::shared_ptr& context, + std::shared_ptr context, at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputCounts, @@ -2631,7 +2630,7 @@ class AsyncAlltoallWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:all_to_all", std::optional>({inputTensor})), - context(context), + context(std::move(context)), outputTensor(outputTensor), inputTensor(inputTensor), outputCounts(std::move(outputCounts)), @@ -2889,7 +2888,7 @@ namespace { class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { public: AsyncBarrierWork( - const std::shared_ptr& context, + std::shared_ptr context, std::vector> priorWork, uint32_t tag, uint64_t seq) @@ -2899,7 +2898,7 @@ class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { seq, "gloo:barrier", std::nullopt), - context(context), + context(std::move(context)), priorWork(std::move(priorWork)), tag(tag) {} diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.cpp b/torch/csrc/distributed/c10d/SymmetricMemory.cpp index 6b4e89ef60a97..7911a9d875b3a 100644 --- a/torch/csrc/distributed/c10d/SymmetricMemory.cpp +++ b/torch/csrc/distributed/c10d/SymmetricMemory.cpp @@ -8,6 +8,8 @@ static bool is_finalizing_ = false; class AllocatorMap { public: + AllocatorMap(const AllocatorMap&) = delete; + AllocatorMap& operator=(const AllocatorMap&) = delete; static AllocatorMap& get() { static AllocatorMap instance; return instance; @@ -35,8 +37,6 @@ class AllocatorMap { private: AllocatorMap() = default; - AllocatorMap(const AllocatorMap&) = delete; - AllocatorMap& 
operator=(const AllocatorMap&) = delete; std::unordered_map< c10::DeviceType, @@ -74,7 +74,8 @@ static at::Tensor empty_strided_p2p_persistent( const size_t numel = std::accumulate( size.begin(), size.end(), - static_cast(1), + size_t(1), + // NOLINTNEXTLINE(modernize-use-transparent-functors) std::multiplies()); const size_t element_size = c10::elementSize(dtype); const size_t alloc_size = numel * element_size; @@ -108,8 +109,7 @@ static at::Tensor empty_strided_p2p_persistent( } // namespace -namespace c10d { -namespace symmetric_memory { +namespace c10d::symmetric_memory { bool is_finalizing() { return is_finalizing_; @@ -162,7 +162,8 @@ at::Tensor empty_strided_p2p( const size_t numel = std::accumulate( size.begin(), size.end(), - static_cast(1), + size_t(1), + // NOLINTNEXTLINE(modernize-use-transparent-functors) std::multiplies()); const size_t element_size = c10::elementSize(dtype); const size_t alloc_size = numel * element_size; @@ -201,5 +202,4 @@ TORCH_API bool has_multicast_support( auto allocator = get_allocator(device_type); return allocator->has_multicast_support(device_idx); } -} // namespace symmetric_memory -} // namespace c10d +} // namespace c10d::symmetric_memory diff --git a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp index c3fa09ab38bef..b5f4a8e547e22 100644 --- a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp +++ b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp @@ -780,7 +780,7 @@ class UvClient : public UvTcpSocket { } bool parse_ping_command() { - uint32_t nonce; + uint32_t nonce = 0; if (!stream.read_value(nonce)) { return false; } diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp index 047459b965589..e4a2d301a5661 100644 --- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp +++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp @@ -83,7 +83,7 @@ bool file_exists(const std::string& path) { #ifdef _WIN32 return std::filesystem::exists(path); #else - struct stat rc; + struct stat rc {}; return lstat(path.c_str(), &rc) == 0; #endif } diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 84bb0133a11bf..f613bf0245502 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -96,11 +96,11 @@ class IntrusivePtrNoGilDestructor { public: IntrusivePtrNoGilDestructor() = default; IntrusivePtrNoGilDestructor(const IntrusivePtrNoGilDestructor&) = default; - IntrusivePtrNoGilDestructor(IntrusivePtrNoGilDestructor&&) = default; + IntrusivePtrNoGilDestructor(IntrusivePtrNoGilDestructor&&) noexcept = default; IntrusivePtrNoGilDestructor& operator=(const IntrusivePtrNoGilDestructor&) = default; - IntrusivePtrNoGilDestructor& operator=(IntrusivePtrNoGilDestructor&&) = - default; + IntrusivePtrNoGilDestructor& operator=( + IntrusivePtrNoGilDestructor&&) noexcept = default; /* implicit */ IntrusivePtrNoGilDestructor(c10::intrusive_ptr impl) : impl_(std::move(impl)) {} // This ctor is very important; see @@ -909,7 +909,7 @@ This class does not support ``__members__`` property.)"); "_register_process_group", [](const std::string& group_name, c10::intrusive_ptr<::c10d::ProcessGroup> group) { - ::c10d::register_process_group(group_name, std::move(group)); + ::c10d::register_process_group(group_name, group); }, py::arg("group_name"), py::arg("group")); @@ -928,7 +928,7 @@ This class does not support ``__members__`` property.)"); const 
c10::intrusive_ptr<::c10d::Work>& work) { dynamic_cast<::c10d::PyProcessGroup::PyWork*>(work.get()) ->ref_py_object(); - ::c10d::register_work(tensor, std::move(work)); + ::c10d::register_work(tensor, work); }, py::arg("tensor"), py::arg("work")); @@ -2179,7 +2179,7 @@ communication mechanism. // python-related libs. self->registerOnCompletionHook( [hookWrapper = ::c10d::PythonOnCompletionHook(std::move( - hook))](std::shared_ptr<::c10d::WorkInfo> workInfo) { + hook))](const std::shared_ptr<::c10d::WorkInfo>& workInfo) { hookWrapper(workInfo); }); }, diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index c0d9863f6cdf3..21b0c4acff19f 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -1698,7 +1698,7 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd - context_ptr->runGradCallbackForVariable(variable, std::move(cb)); + context_ptr->runGradCallbackForVariable(variable, cb); } #endif } @@ -1759,15 +1759,17 @@ void Reducer::sync_bucket_indices( num_buckets = indices_accessor[indices_accessor_Index]; // Broadcast bucket_sizes - auto bucket_sizes_tensor = at::empty({(int64_t)num_buckets}, at::kInt); + auto bucket_sizes_tensor = + at::empty({static_cast(num_buckets)}, at::kInt); auto bucket_sizes_accessor = bucket_sizes_tensor.accessor(); for (const auto i : c10::irange(num_buckets)) { // For rank != 0, it is possible that local num buckets bucket_sizes.size() // is smaller than broadcasted num_buckets - bucket_sizes_accessor[i] = - bucket_sizes.at(std::min(i, (bucket_sizes.size() - 1))); + bucket_sizes_accessor[static_cast(i)] = static_cast( + bucket_sizes.at(std::min(i, (bucket_sizes.size() - 1)))); } - auto bucket_sizes_tensor_device = at::empty({(int64_t)num_buckets}, options); + auto bucket_sizes_tensor_device = + at::empty({static_cast(num_buckets)}, options); bucket_sizes_tensor_device.copy_(bucket_sizes_tensor, /*non_blocking=*/true); std::vector bucket_sizes_tensor_list = { bucket_sizes_tensor_device}; diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp index f219369541ca2..db76a1eb284e5 100644 --- a/torch/csrc/distributed/c10d/socket.cpp +++ b/torch/csrc/distributed/c10d/socket.cpp @@ -208,7 +208,7 @@ std::string formatSockAddr(const struct ::sockaddr* addr, socklen_t len) { struct sockaddr_in* psai = (struct sockaddr_in*)&addr; char ip[INET_ADDRSTRLEN]; if (inet_ntop(addr->sa_family, &(psai->sin_addr), ip, INET_ADDRSTRLEN) != - NULL) { + nullptr) { return fmt::format("{}:{}", ip, psai->sin_port); } } else if (addr->sa_family == AF_INET6) { @@ -216,7 +216,7 @@ std::string formatSockAddr(const struct ::sockaddr* addr, socklen_t len) { char ip[INET6_ADDRSTRLEN]; if (inet_ntop( addr->sa_family, &(psai->sin6_addr), ip, INET6_ADDRSTRLEN) != - NULL) { + nullptr) { return fmt::format("[{}]:{}", ip, psai->sin6_port); } } From a94c501b8409670ffb7ec2e3e060542142fc8f86 Mon Sep 17 00:00:00 2001 From: chilli Date: Wed, 23 Oct 2024 14:52:42 -0700 Subject: [PATCH 013/161] Fixed max-autotune in FlexAttention to reset kernel options appropriately (#138733) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138733 Approved by: https://github.com/drisspg, https://github.com/BoyuanFeng --- torch/_inductor/kernel/flex_attention.py | 30 +++++++++++++----------- torch/_inductor/kernel/flex_decoding.py | 8 ++++--- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git 
a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index 4c4fa60d04b0f..2e8935cfed6b8 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -851,7 +851,7 @@ def flex_attention( # Note, we don't need to pass in the captured buffers explicitly # because they're implicitly added by the score_mod function # We do need to explicitly pass it in for autotuning though. - + original_kernel_options = kernel_options.copy() for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0 or SPARSE_Q_BLOCK_SIZE % BLOCK_M != 0: continue @@ -859,12 +859,13 @@ def flex_attention( if num_stages == 2: continue + cur_kernel_options = original_kernel_options.copy() # Performance tuning - kernel_options.setdefault("BLOCK_M", BLOCK_M) - kernel_options.setdefault("BLOCK_N", BLOCK_N) + cur_kernel_options.setdefault("BLOCK_M", BLOCK_M) + cur_kernel_options.setdefault("BLOCK_N", BLOCK_N) # Blocksparse options - kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) - kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + cur_kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + cur_kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) flex_attention_template.maybe_append_choice( choices=choices, @@ -889,7 +890,7 @@ def flex_attention( num_stages=num_stages, num_warps=num_warps, call_sizes=query.get_size(), - **kernel_options, + **cur_kernel_options, ) inputs_for_autotuning = ( [ @@ -1782,7 +1783,7 @@ def flex_attention_backward(*args, **kwargs): if BLOCK2 % BLOCK1 == 0 ] ) - + original_kernel_options = kernel_options.copy() for BLOCK1, BLOCK2, num_warps, num_stages in configs: if ( SPARSE_KV_BLOCK_SIZE % BLOCK1 != 0 @@ -1793,13 +1794,14 @@ def flex_attention_backward(*args, **kwargs): continue # Performance tuning - kernel_options.setdefault("BLOCK_M1", BLOCK1) - kernel_options.setdefault("BLOCK_N1", BLOCK2) - kernel_options.setdefault("BLOCK_M2", BLOCK2) - kernel_options.setdefault("BLOCK_N2", BLOCK1) + cur_kernel_options = original_kernel_options.copy() + cur_kernel_options.setdefault("BLOCK_M1", BLOCK1) + cur_kernel_options.setdefault("BLOCK_N1", BLOCK2) + cur_kernel_options.setdefault("BLOCK_M2", BLOCK2) + cur_kernel_options.setdefault("BLOCK_N2", BLOCK1) # Blocksparse options - kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) - kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + cur_kernel_options.setdefault("SPARSE_Q_BLOCK_SIZE", SPARSE_Q_BLOCK_SIZE) + cur_kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) flex_attention_backward_template.maybe_append_choice( choices=choices, @@ -1827,7 +1829,7 @@ def flex_attention_backward(*args, **kwargs): call_sizes=query.get_size() + key.get_size()[1:3], num_stages=num_stages, num_warps=num_warps, - **kernel_options, + **cur_kernel_options, ) inputs_for_autotuning = ( [ diff --git a/torch/_inductor/kernel/flex_decoding.py b/torch/_inductor/kernel/flex_decoding.py index 291a78eeb4c27..7b1c3466b1290 100644 --- a/torch/_inductor/kernel/flex_decoding.py +++ b/torch/_inductor/kernel/flex_decoding.py @@ -479,6 +479,7 @@ def create_flex_decoding_kernel(*args, **kwargs): # Mark SPARSE_KV_BLOCK_SIZE as static shapes and add guards. 
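The decoding kernel gets the same treatment as `flex_attention.py` above: because `dict.setdefault` never overwrites an existing key, calling it on a dict shared across the autotuning loop silently pins the first candidate's block sizes for every later candidate, so each iteration now starts from a fresh copy (see the `cur_kernel_options` hunk that follows). A plain-Python sketch of the failure mode, with illustrative names:

```python
def tune_buggy(configs, kernel_options):
    picked = []
    for block_m, block_n in configs:
        kernel_options.setdefault("BLOCK_M", block_m)  # sticks after iteration 0
        kernel_options.setdefault("BLOCK_N", block_n)
        picked.append(dict(kernel_options))
    return picked

def tune_fixed(configs, kernel_options):
    picked = []
    for block_m, block_n in configs:
        cur = kernel_options.copy()  # reset per candidate
        cur.setdefault("BLOCK_M", block_m)
        cur.setdefault("BLOCK_N", block_n)
        picked.append(cur)
    return picked

configs = [(64, 64), (128, 32)]
print(tune_buggy(configs, {}))  # second candidate still reports BLOCK_M=64
print(tune_fixed(configs, {}))  # each candidate keeps its own block sizes
```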
SPARSE_KV_BLOCK_SIZE = V.graph.sizevars.evaluate_static_shape(SPARSE_KV_BLOCK_SIZE) + original_kernel_options = kernel_options.copy() # Note, we don't need to pass in the captured buffers explicitly # because they're implicitly added by the score_mod function # We do need to explicitly pass it in for autotuning though. @@ -486,9 +487,10 @@ def create_flex_decoding_kernel(*args, **kwargs): if SPARSE_KV_BLOCK_SIZE % BLOCK_N != 0: continue + cur_kernel_options = original_kernel_options.copy() # Performance tuning - kernel_options.setdefault("BLOCK_N", BLOCK_N) - kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) + cur_kernel_options.setdefault("BLOCK_N", BLOCK_N) + cur_kernel_options.setdefault("SPARSE_KV_BLOCK_SIZE", SPARSE_KV_BLOCK_SIZE) # Work around https://github.com/pytorch/pytorch/issues/129625 if num_stages == 2: @@ -515,7 +517,7 @@ def create_flex_decoding_kernel(*args, **kwargs): num_stages=num_stages, num_warps=num_warps, call_sizes=query.get_size(), - **kernel_options, + **cur_kernel_options, ) inputs_for_flex_decoding = ( From 96b30dcb25c80513769dae2a8688aec080b00117 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Thu, 24 Oct 2024 05:29:47 +0000 Subject: [PATCH 014/161] [Windows][cpu] mkl use mimalloc as allocator on Windows (#138419) We did a lot of optimization for PyTorch Windows, and we got good progress of it. But still some models have performance gap between PyTorch Windows and PyTorch Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion From the blog conclusion, we found the `ResNet50` is typical case of it. Let's focus on the `ResNet50`, and collect the profiling log: ```cmd (nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ model_inference 3.91% 682.427ms 100.00% 17.448s 17.448s 1 aten::conv2d 0.18% 30.906ms 64.79% 11.305s 2.133ms 5300 aten::convolution 0.45% 78.031ms 64.62% 11.275s 2.127ms 5300 aten::_convolution 0.30% 51.670ms 64.17% 11.196s 2.113ms 5300 aten::mkldnn_convolution 63.58% 11.093s 63.87% 11.145s 2.103ms 5300 aten::batch_norm 0.13% 23.536ms 20.10% 3.506s 661.580us 5300 aten::_batch_norm_impl_index 0.28% 49.486ms 19.96% 3.483s 657.139us 5300 aten::native_batch_norm 19.26% 3.360s 19.64% 3.427s 646.615us 5300 aten::max_pool2d 0.01% 1.038ms 5.84% 1.018s 10.181ms 100 aten::max_pool2d_with_indices 5.83% 1.017s 5.83% 1.017s 10.171ms 100 aten::add_ 3.38% 588.907ms 3.38% 588.907ms 85.349us 6900 aten::relu_ 0.35% 60.358ms 1.67% 292.155ms 59.624us 4900 aten::clamp_min_ 1.33% 231.797ms 1.33% 231.797ms 47.306us 4900 aten::empty 0.46% 80.195ms 0.46% 80.195ms 1.513us 53000 aten::linear 0.01% 927.300us 0.23% 39.353ms 393.532us 100 aten::addmm 0.20% 35.379ms 0.21% 37.016ms 370.155us 100 aten::empty_like 0.12% 20.455ms 0.17% 29.976ms 5.656us 5300 aten::as_strided_ 0.11% 18.830ms 0.11% 18.830ms 3.553us 5300 aten::adaptive_avg_pool2d 0.00% 419.900us 0.08% 14.265ms 142.647us 100 aten::mean 0.01% 1.737ms 0.08% 13.845ms 138.448us 100 aten::sum 0.05% 8.113ms 0.05% 8.648ms 86.479us 100 aten::resize_ 0.03% 5.182ms 0.03% 5.182ms 0.978us 5300 aten::div_ 0.01% 1.445ms 0.02% 3.460ms 34.600us 100 aten::to 0.00% 337.000us 0.01% 2.015ms 20.154us 100 aten::_to_copy 0.01% 977.500us 0.01% 1.678ms 16.784us 100 aten::copy_ 0.01% 1.474ms 0.01% 1.474ms 7.371us 
200
aten::t                                        0.00%     775.900us     0.01%     1.410ms     14.104us     100
aten::flatten                                  0.00%     420.900us     0.01%     1.311ms     13.106us     100
aten::view                                     0.01%     889.700us     0.01%     889.700us    8.897us      100
aten::transpose                                0.00%     410.700us     0.00%     634.500us    6.345us      100
aten::expand                                   0.00%     496.800us     0.00%     566.800us    5.668us      100
aten::fill_                                    0.00%     534.800us     0.00%     534.800us    5.348us      100
aten::as_strided                               0.00%     293.800us     0.00%     293.800us    1.469us      200
aten::empty_strided                            0.00%     241.700us     0.00%     241.700us    2.417us      100
aten::resolve_conj                             0.00%     54.800us      0.00%     54.800us     0.274us      200
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------
Self CPU time total: 17.448s
Execution time: 20.02380895614624
```

We found that the kernel consuming the most CPU time is `aten::mkldnn_convolution`, which is dispatched to `MKLDNN`. We had already optimized memory allocation by integrating mimalloc into the PyTorch C10 module, and that gave PyTorch on Windows a big boost, but it does not cover the intermediate temporary memory used by `MKL` and `MKLDNN`. So there is still room to improve PyTorch Windows performance by optimizing that intermediate temporary memory.

After discussing with the Intel MKL team, we have a way to register high-performance memory allocation APIs with MKL, which helps MKL improve its memory performance. Please check the online document:
https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html

This PR optimizes MKL memory allocation performance on Windows by registering mi_malloc with MKL.

PR changes:
1. Add a CMake option, `USE_MIMALLOC_ON_MKL`; it is a sub-option of `USE_MIMALLOC`.
2. Wrap and export the mi_malloc APIs in C10 when `USE_MIMALLOC_ON_MKL` is `ON`.
3. Add MklAllocationHelper.cpp to register the allocation APIs with MKL when `USE_MIMALLOC_ON_MKL` is `ON`.

For `oneDNN`, this is still being tracked in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419
Approved by: https://github.com/jgong5, https://github.com/ezyang
---
 CMakeLists.txt                                |  6 ++++
 .../ATen/native/mkl/MklAllocationHelper.cpp   | 29 +++++++++++++++++++
 c10/core/impl/alloc_cpu.cpp                   | 23 +++++++++++++++
 c10/core/impl/alloc_cpu.h                     | 10 +++++++
 cmake/Summary.cmake                           |  3 ++
 5 files changed, 71 insertions(+)
 create mode 100644 aten/src/ATen/native/mkl/MklAllocationHelper.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13fc95364f207..30377996b3e39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -381,8 +381,10 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
 # on Windows.
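The registration performed by `MklAllocationHelper.cpp` (in the diff below) uses MKL's redefinable memory functions: `i_malloc.h` exposes `i_malloc`/`i_calloc`/`i_realloc`/`i_free` function pointers that can be reassigned before the first MKL call. A stripped-down sketch of that interface using the standard allocator, independent of mimalloc and of the C10 wrappers:

```cpp
#include <cstdlib>

#include <i_malloc.h>  // ships with oneMKL

static void* my_malloc(size_t size) { return std::malloc(size); }
static void* my_calloc(size_t count, size_t size) { return std::calloc(count, size); }
static void* my_realloc(void* ptr, size_t new_size) { return std::realloc(ptr, new_size); }
static void my_free(void* ptr) { std::free(ptr); }

// Runs at static-initialization time, before any MKL routine allocates.
static bool register_custom_mkl_allocator() {
  i_malloc = my_malloc;
  i_calloc = my_calloc;
  i_realloc = my_realloc;
  i_free = my_free;
  return true;
}

static bool registered = register_custom_mkl_allocator();
```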
+option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) if(WIN32) set(USE_MIMALLOC ON) + set(USE_MIMALLOC_ON_MKL ON) endif() if(USE_CCACHE) @@ -1237,6 +1239,10 @@ if(USE_MIMALLOC) include_directories(third_party/mimalloc/include) endif() +if(USE_MIMALLOC AND USE_MIMALLOC_ON_MKL) + add_definitions(-DUSE_MIMALLOC_ON_MKL) +endif() + # ---[ Main build add_subdirectory(c10) add_subdirectory(caffe2) diff --git a/aten/src/ATen/native/mkl/MklAllocationHelper.cpp b/aten/src/ATen/native/mkl/MklAllocationHelper.cpp new file mode 100644 index 0000000000000..3ac062fb99776 --- /dev/null +++ b/aten/src/ATen/native/mkl/MklAllocationHelper.cpp @@ -0,0 +1,29 @@ +#include + +#if AT_MKLDNN_ENABLED() +#ifdef USE_MIMALLOC_ON_MKL +#include +#include +#if INTEL_MKL_VERSION > 20230000L +/* +MKL have a method to register memory allocation APIs via i_malloc.h, High +performance memory allocation APIs will help improve MKL performance. +Please check MKL online document: +https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2024-2/redefining-memory-functions.html +*/ +#include + +bool register_mimalloc_api_to_mkl() +{ + i_malloc = c10::mi_malloc_wrapper::c10_mi_malloc; + i_calloc = c10::mi_malloc_wrapper::c10_mi_calloc; + i_realloc = c10::mi_malloc_wrapper::c10_mi_realloc; + i_free = c10::mi_malloc_wrapper::c10_mi_free; + + return true; +} + +static bool g_b_registered_mkl_alloction = register_mimalloc_api_to_mkl(); +#endif +#endif +#endif diff --git a/c10/core/impl/alloc_cpu.cpp b/c10/core/impl/alloc_cpu.cpp index 31b7489688c2e..f976e7b745e21 100644 --- a/c10/core/impl/alloc_cpu.cpp +++ b/c10/core/impl/alloc_cpu.cpp @@ -163,4 +163,27 @@ void free_cpu(void* data) { #endif } +#ifdef USE_MIMALLOC_ON_MKL +namespace mi_malloc_wrapper { +void* c10_mi_malloc(size_t size) { + return mi_malloc(size); +} + +void* c10_mi_calloc(size_t count, size_t size) { + return mi_calloc(count, size); +} + +void* c10_mi_realloc(void* p, size_t newsize) { + return mi_realloc(p, newsize); +} + +void* c10_mi_malloc_aligned(size_t size, size_t alignment) { + return mi_malloc_aligned(size, alignment); +} + +void c10_mi_free(void* p) { + mi_free(p); +} +} // namespace mi_malloc_wrapper +#endif } // namespace c10 diff --git a/c10/core/impl/alloc_cpu.h b/c10/core/impl/alloc_cpu.h index ee32a0f463068..8d506acf392f4 100644 --- a/c10/core/impl/alloc_cpu.h +++ b/c10/core/impl/alloc_cpu.h @@ -9,4 +9,14 @@ namespace c10 { C10_API void* alloc_cpu(size_t nbytes); C10_API void free_cpu(void* data); +#ifdef USE_MIMALLOC_ON_MKL +namespace mi_malloc_wrapper { +C10_API void* c10_mi_malloc(size_t size); +C10_API void* c10_mi_calloc(size_t count, size_t size); +C10_API void* c10_mi_realloc(void* p, size_t newsize); +C10_API void* c10_mi_malloc_aligned(size_t size, size_t alignment); +C10_API void c10_mi_free(void* p); +} // namespace mi_malloc_wrapper +#endif + } // namespace c10 diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index d51c451589c2c..3f70465c91d6d 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -163,6 +163,9 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_OPENCL : ${USE_OPENCL}") message(STATUS " USE_OPENMP : ${USE_OPENMP}") message(STATUS " USE_MIMALLOC : ${USE_MIMALLOC}") + if(${USE_MIMALLOC}) + message(STATUS " USE_MIMALLOC_ON_MKL : ${USE_MIMALLOC_ON_MKL}") + endif() message(STATUS " USE_VULKAN : ${USE_VULKAN}") if(${USE_VULKAN}) message(STATUS " USE_VULKAN_FP16_INFERENCE : ${USE_VULKAN_FP16_INFERENCE}") From dbf0fa811acf4c892149e09da6890cf584a3ffa1 Mon Sep 17 00:00:00 2001 From: 
Richard Barnes Date: Thu, 24 Oct 2024 07:51:02 +0000 Subject: [PATCH 015/161] Remove C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA and CONSTEXPR_EXCEPT_WIN_CUDA (#138479) BC linter suppressed due to removal of `tools/linter/adapters/constexpr_linter.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138479 Approved by: https://github.com/eqy, https://github.com/malfet --- .lintrunner.toml | 11 --- aten/src/ATen/native/cuda/AbsKernel.cu | 2 +- .../ATen/native/cuda/BinaryDivTrueKernel.cu | 2 +- .../native/cuda/BinaryLogicalOpsKernels.cu | 6 +- .../cuda/BinaryMiscBackwardOpsKernels.cu | 4 +- aten/src/ATen/native/cuda/BinaryMulKernel.cu | 2 +- aten/src/ATen/native/cuda/GcdLcmKernel.cu | 4 +- aten/src/ATen/native/cuda/Lerp.cu | 4 +- .../ATen/native/cuda/PointwiseOpsKernel.cu | 4 +- aten/src/ATen/native/cuda/PowKernel.cu | 4 +- .../ATen/native/cuda/ReduceSumProdKernel.cu | 6 +- .../ATen/native/cuda/UnaryComplexKernels.cu | 4 +- .../src/ATen/native/cuda/UnaryGammaKernels.cu | 8 +- .../native/cuda/UnaryGeometricAcosKernel.cu | 2 +- .../native/cuda/UnaryGeometricAcoshKernel.cu | 2 +- .../native/cuda/UnaryGeometricAsinKernel.cu | 2 +- .../native/cuda/UnaryGeometricAsinhKernel.cu | 2 +- .../native/cuda/UnaryGeometricAtanKernel.cu | 2 +- .../native/cuda/UnaryGeometricAtanhKernel.cu | 2 +- .../native/cuda/UnaryGeometricCosKernel.cu | 2 +- .../native/cuda/UnaryGeometricCoshKernel.cu | 2 +- .../native/cuda/UnaryGeometricSinKernel.cu | 2 +- .../native/cuda/UnaryGeometricSinhKernel.cu | 2 +- .../native/cuda/UnaryGeometricTanKernel.cu | 2 +- .../native/cuda/UnaryGeometricTanhKernel.cu | 2 +- aten/src/ATen/native/cuda/UnaryLogKernels.cu | 6 +- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 6 +- aten/src/ATen/native/cuda/UnarySignKernels.cu | 4 +- .../ATen/native/cuda/UnarySpecialOpsKernel.cu | 28 +++--- aten/src/ATen/native/cuda/ZetaKernel.cu | 2 +- aten/src/ATen/native/cuda/airy_ai.cu | 2 +- aten/src/ATen/native/cuda/bessel_j0.cu | 2 +- aten/src/ATen/native/cuda/bessel_j1.cu | 2 +- aten/src/ATen/native/cuda/bessel_y0.cu | 2 +- aten/src/ATen/native/cuda/bessel_y1.cu | 2 +- .../native/cuda/chebyshev_polynomial_t.cu | 2 +- .../native/cuda/chebyshev_polynomial_u.cu | 2 +- .../native/cuda/chebyshev_polynomial_v.cu | 2 +- .../native/cuda/chebyshev_polynomial_w.cu | 2 +- .../ATen/native/cuda/hermite_polynomial_h.cu | 2 +- .../ATen/native/cuda/hermite_polynomial_he.cu | 2 +- .../ATen/native/cuda/laguerre_polynomial_l.cu | 2 +- .../ATen/native/cuda/modified_bessel_i0.cu | 2 +- .../ATen/native/cuda/modified_bessel_i1.cu | 2 +- .../ATen/native/cuda/modified_bessel_k0.cu | 2 +- .../ATen/native/cuda/modified_bessel_k1.cu | 2 +- .../native/cuda/scaled_modified_bessel_k0.cu | 2 +- .../native/cuda/scaled_modified_bessel_k1.cu | 2 +- .../cuda/shifted_chebyshev_polynomial_t.cu | 2 +- .../cuda/shifted_chebyshev_polynomial_u.cu | 2 +- .../cuda/shifted_chebyshev_polynomial_v.cu | 2 +- .../cuda/shifted_chebyshev_polynomial_w.cu | 2 +- .../ATen/native/cuda/spherical_bessel_j0.cu | 2 +- c10/macros/Macros.h | 58 +---------- c10/util/ArrayRef.h | 15 ++- c10/util/ConstexprCrc.h | 11 ++- c10/util/typeid.h | 4 +- tools/linter/adapters/constexpr_linter.py | 96 ------------------- torch/csrc/jit/api/module.h | 8 +- torch/csrc/jit/serialization/pickler.h | 2 +- torchgen/gen.py | 10 +- 61 files changed, 109 insertions(+), 272 deletions(-) delete mode 100644 tools/linter/adapters/constexpr_linter.py diff --git a/.lintrunner.toml b/.lintrunner.toml index fcf859a862d5a..d82ee315e73a5 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ 
-376,17 +376,6 @@ command = [ ] is_formatter = true -[[linter]] -code = 'CONSTEXPR' -include_patterns=['aten/src/ATen/native/cuda/*.cu'] -command = [ - 'python3', - 'tools/linter/adapters/constexpr_linter.py', - '--', - '@{{PATHSFILE}}', -] -is_formatter = true - [[linter]] code = 'SPACES' include_patterns = ['**'] diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 980bd6637341e..e2c0a456a232b 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -15,7 +15,7 @@ struct AbsFunctor { } }; -CONSTEXPR_EXCEPT_WIN_CUDA char abs_name[] = "abs_kernel"; +constexpr char abs_name[] = "abs_kernel"; void abs_kernel_cuda(TensorIteratorBase& iter) { auto dtype = iter.dtype(); if (at::isComplexType(dtype)) { diff --git a/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu b/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu index aa955a9c7e546..a7fa53fcb0abd 100644 --- a/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryDivTrueKernel.cu @@ -16,7 +16,7 @@ namespace at::native { namespace binary_internal { -CONSTEXPR_EXCEPT_WIN_CUDA char div_name[] = "div_kernel"; +constexpr char div_name[] = "div_kernel"; void div_true_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (iter.common_dtype() == kComplexHalf) { diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index eaa01ac1accc8..918a6ba4e981e 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -11,7 +11,7 @@ namespace at::native { -CONSTEXPR_EXCEPT_WIN_CUDA char logical_and_name[] = "logical_and_kernel"; +constexpr char logical_and_name[] = "logical_and_kernel"; void logical_and_kernel_cuda(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { @@ -48,7 +48,7 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char logical_or_name[] = "logical_or_kernel"; +constexpr char logical_or_name[] = "logical_or_kernel"; void logical_or_kernel_cuda(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { @@ -84,7 +84,7 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char logical_xor_name[] = "logical_xor_kernel"; +constexpr char logical_xor_name[] = "logical_xor_kernel"; void logical_xor_kernel_cuda(TensorIterator& iter) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index 75d5991f93db5..0cd4c5040fe70 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -15,7 +15,7 @@ namespace at::native { -CONSTEXPR_EXCEPT_WIN_CUDA char sigmoid_backward_name[] = "sigmoid_backward"; +constexpr char sigmoid_backward_name[] = "sigmoid_backward"; void sigmoid_backward_kernel_cuda(TensorIteratorBase& iter) { auto dtype = iter.dtype(); if(isComplexType(dtype)) { @@ -86,7 +86,7 @@ void logit_backward_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scal }); } -CONSTEXPR_EXCEPT_WIN_CUDA char tanh_backward_name[] = "tanh_backward"; +constexpr char tanh_backward_name[] = "tanh_backward"; void tanh_backward_kernel_cuda(TensorIteratorBase& iter) { auto dtype = iter.dtype(); if(isComplexType(dtype)) { diff --git 
a/aten/src/ATen/native/cuda/BinaryMulKernel.cu b/aten/src/ATen/native/cuda/BinaryMulKernel.cu index 251221f7adcd1..242ff1c7cd52e 100644 --- a/aten/src/ATen/native/cuda/BinaryMulKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulKernel.cu @@ -18,7 +18,7 @@ namespace at::native { -CONSTEXPR_EXCEPT_WIN_CUDA char mul_name[] = "mul_kernel"; +constexpr char mul_name[] = "mul_kernel"; void mul_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (common_dtype == kComplexHalf) { diff --git a/aten/src/ATen/native/cuda/GcdLcmKernel.cu b/aten/src/ATen/native/cuda/GcdLcmKernel.cu index c4a8cdfaf1f8e..6b003a6f4fc03 100644 --- a/aten/src/ATen/native/cuda/GcdLcmKernel.cu +++ b/aten/src/ATen/native/cuda/GcdLcmKernel.cu @@ -14,7 +14,7 @@ namespace at::native { // See note [Jiterator] -CONSTEXPR_EXCEPT_WIN_CUDA char gcd_name[] = "gcd"; +constexpr char gcd_name[] = "gcd"; void gcd_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "gcd_cuda", [&]() { @@ -33,7 +33,7 @@ void gcd_kernel_cuda(TensorIteratorBase& iter) { } // See note [Jiterator] -CONSTEXPR_EXCEPT_WIN_CUDA char lcm_name[] = "lcm"; +constexpr char lcm_name[] = "lcm"; void lcm_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "lcm_cuda", [&]() { diff --git a/aten/src/ATen/native/cuda/Lerp.cu b/aten/src/ATen/native/cuda/Lerp.cu index 01053a3beeabd..25692dcd4c494 100644 --- a/aten/src/ATen/native/cuda/Lerp.cu +++ b/aten/src/ATen/native/cuda/Lerp.cu @@ -9,7 +9,7 @@ namespace at::native { namespace { -CONSTEXPR_EXCEPT_WIN_CUDA char lerp_tensor_name[] = "lerp_tensor"; +constexpr char lerp_tensor_name[] = "lerp_tensor"; void lerp_tensor_kernel(at::TensorIteratorBase& iter) { auto dtype = iter.common_dtype(); if(at::isComplexType(dtype)) { @@ -63,7 +63,7 @@ void lerp_tensor_kernel(at::TensorIteratorBase& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char lerp_scalar_name[] = "lerp_scalar"; +constexpr char lerp_scalar_name[] = "lerp_scalar"; void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight) { auto dtype = iter.common_dtype(); if (at::isComplexType(dtype)) { diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu index 4f174bf0874f0..eee0047fd7295 100644 --- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu +++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu @@ -12,7 +12,7 @@ namespace at::native { #if AT_USE_JITERATOR() && CUDA_VERSION >= 11050 -CONSTEXPR_EXCEPT_WIN_CUDA char addcmul_name[] = "addcmul"; +constexpr char addcmul_name[] = "addcmul"; #endif void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { auto dtype = iter.common_dtype(); @@ -59,7 +59,7 @@ void addcmul_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { #if AT_USE_JITERATOR() && CUDA_VERSION >= 11050 // return a + alpha * (b / static_cast(c)); -CONSTEXPR_EXCEPT_WIN_CUDA char addcdiv_name[] = "addcdiv"; +constexpr char addcdiv_name[] = "addcdiv"; #endif void addcdiv_cuda_kernel(TensorIteratorBase& iter, const Scalar& value) { auto dtype = iter.common_dtype(); diff --git a/aten/src/ATen/native/cuda/PowKernel.cu b/aten/src/ATen/native/cuda/PowKernel.cu index eb56da722fbb8..010818ca213aa 100644 --- a/aten/src/ATen/native/cuda/PowKernel.cu +++ b/aten/src/ATen/native/cuda/PowKernel.cu @@ -38,7 +38,7 @@ void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base } /* complex support impl */ -CONSTEXPR_EXCEPT_WIN_CUDA char 
pow_scalar_base_name[] = "pow_scalar_base_kernel"; +constexpr char pow_scalar_base_name[] = "pow_scalar_base_kernel"; template <> void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base) { using scalar_t = c10::complex; @@ -68,7 +68,7 @@ namespace { #if AT_USE_JITERATOR() /* complex support impl */ -CONSTEXPR_EXCEPT_WIN_CUDA char pow_name[] = "pow_kernel"; +constexpr char pow_name[] = "pow_kernel"; static const auto pow_kernel_string = jiterator_stringify(template T pow_kernel(T base, T exp) { return std::pow(base, exp); diff --git a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu index e628e1916f9e6..dc2f0fa492a7a 100644 --- a/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceSumProdKernel.cu @@ -21,7 +21,7 @@ struct sum_functor { }; // jiterated specialization for `complex` -CONSTEXPR_EXCEPT_WIN_CUDA char sum_name[] = "sum"; +constexpr char sum_name[] = "sum"; template <> struct sum_functor> { // jiterator reduction fails on windows @@ -57,7 +57,7 @@ struct nansum_functor { } }; -CONSTEXPR_EXCEPT_WIN_CUDA char nansum_name[] = "nansum"; +constexpr char nansum_name[] = "nansum"; template struct nansum_functor_complex { #if AT_USE_JITERATOR() @@ -79,7 +79,7 @@ struct nansum_functor_complex { #endif }; -CONSTEXPR_EXCEPT_WIN_CUDA char prod_name[] = "prod"; +constexpr char prod_name[] = "prod"; template struct prod_functor { // jiterator reduction fails on windows diff --git a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu index 14c4e934c69b5..960414f63cda5 100644 --- a/aten/src/ATen/native/cuda/UnaryComplexKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryComplexKernels.cu @@ -26,7 +26,7 @@ __host__ __device__ static inline c10::complex angle_wrapper(c10::complex } #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char angle_name[] = "angle_kernel"; +constexpr char angle_name[] = "angle_kernel"; #endif void angle_kernel_cuda(TensorIteratorBase& iter) { @@ -63,7 +63,7 @@ void angle_kernel_cuda(TensorIteratorBase& iter) { } // NB: Ignores the negative bit on tensors -CONSTEXPR_EXCEPT_WIN_CUDA char conj_name[] = "conj_kernel"; +constexpr char conj_name[] = "conj_kernel"; void conj_kernel_cuda(TensorIteratorBase& iter) { auto conj_chalf = [&] { using scalar_t = c10::complex; diff --git a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu index 34ccfa298310e..6448335002cdd 100644 --- a/aten/src/ATen/native/cuda/UnaryGammaKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGammaKernels.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char digamma_name[] = "digamma"; +constexpr char digamma_name[] = "digamma"; #endif // AT_USE_JITERATOR() // See note [Jiterator] void digamma_kernel_cuda(TensorIteratorBase& iter) { @@ -40,7 +40,7 @@ void digamma_kernel_cuda(TensorIteratorBase& iter) { } // See note [Jiterator] -CONSTEXPR_EXCEPT_WIN_CUDA char trigamma_name[] = "trigamma"; +constexpr char trigamma_name[] = "trigamma"; void trigamma_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2( @@ -64,7 +64,7 @@ void trigamma_kernel_cuda(TensorIteratorBase& iter) { #endif // AT_USE_JITERATOR() } -CONSTEXPR_EXCEPT_WIN_CUDA char polygamma_name[] = "polygamma"; +constexpr char polygamma_name[] = "polygamma"; void polygamma_kernel_cuda(TensorIteratorBase& iter, int64_t n) { if (n == 0) { digamma_kernel_cuda(iter); @@ -101,7 +101,7 @@ 
void polygamma_kernel_cuda(TensorIteratorBase& iter, int64_t n) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char lgamma_name[] = "lgamma_kernel"; +constexpr char lgamma_name[] = "lgamma_kernel"; void lgamma_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2( diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu index 42ef6a9960cf4..bd779fed2ab43 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAcosKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if 0 && AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char acos_name[] = "acos_impl"; +constexpr char acos_name[] = "acos_impl"; #endif void acos_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu index d621dd246aa49..ab178f6df1f27 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAcoshKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if 0 && AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char acosh_name[] = "acosh_impl"; +constexpr char acosh_name[] = "acosh_impl"; #endif void acosh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu index e9b16dd3d2b6d..97a4e2b46e823 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAsinKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if 0 && AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char asin_name[] = "asin_impl"; +constexpr char asin_name[] = "asin_impl"; #endif void asin_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu index 7494932f9d538..1a0b2ce9e38c6 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAsinhKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if 0 && AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char asinh_name[] = "asinh_impl"; +constexpr char asinh_name[] = "asinh_impl"; #endif void asinh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu index 758d7bc5c86de..5018ac8a31257 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAtanKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char atan_name[] = "atan_impl"; +constexpr char atan_name[] = "atan_impl"; #endif void atan_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu index aad7775219af7..71b65815bfea9 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricAtanhKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char atanh_name[] = "atanh_impl"; +constexpr char atanh_name[] = "atanh_impl"; #endif void atanh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu index 
2a994fb626af4..0cac6ff79c3b5 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricCosKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char cos_name[] = "cos_impl"; +constexpr char cos_name[] = "cos_impl"; #endif // AT_USE_JITERATOR() void cos_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu index 49babec1378a3..a5e390c8ec392 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricCoshKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char cosh_name[] = "cosh_impl"; +constexpr char cosh_name[] = "cosh_impl"; #endif void cosh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu index d87a190959781..3613192562e44 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricSinKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char sin_name[] = "sin_impl"; +constexpr char sin_name[] = "sin_impl"; #endif void sin_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu index 82b730a0ffbc9..039700c21be02 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricSinhKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char sinh_name[] = "sinh_impl"; +constexpr char sinh_name[] = "sinh_impl"; #endif void sinh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu index 8f62529e8e095..a71588e551cf0 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char tan_name[] = "tan_impl"; +constexpr char tan_name[] = "tan_impl"; #endif void tan_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu index d5f0172015d5e..6a9f6a4cbdd67 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char tanh_name[] = "tanh_impl"; +constexpr char tanh_name[] = "tanh_impl"; #endif void tanh_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryLogKernels.cu b/aten/src/ATen/native/cuda/UnaryLogKernels.cu index 2a2f56670b78b..f213886319d35 100644 --- a/aten/src/ATen/native/cuda/UnaryLogKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryLogKernels.cu @@ -13,7 +13,7 @@ namespace at::native { #if AT_USE_JITERATOR() -CONSTEXPR_EXCEPT_WIN_CUDA char log_name[] = "log_kernel"; +constexpr char log_name[] = "log_kernel"; #endif void log_kernel_cuda(TensorIteratorBase& iter) { @@ -47,7 +47,7 @@ void log_kernel_cuda(TensorIteratorBase& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char log10_name[] = "log10_kernel"; +constexpr char log10_name[] = 
"log10_kernel"; void log10_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { @@ -84,7 +84,7 @@ void log1p_kernel_cuda(TensorIteratorBase& iter) { }); } -CONSTEXPR_EXCEPT_WIN_CUDA char log2_name[] = "log2_kernel"; +constexpr char log2_name[] = "log2_kernel"; void log2_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index b0d6f549ab24d..5eb64ab57258e 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -34,7 +34,7 @@ void bitwise_not_kernel_cuda(TensorIteratorBase& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char exp_name[] = "exp_kernel"; +constexpr char exp_name[] = "exp_kernel"; void exp_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { @@ -92,7 +92,7 @@ C10_HOST_DEVICE static inline c10::complex rsqrt_wrapper(c10::complex v) { return one / ::sqrt(v); } -CONSTEXPR_EXCEPT_WIN_CUDA char rsqrt_name[] = "rsqrt_kernel"; +constexpr char rsqrt_name[] = "rsqrt_kernel"; void rsqrt_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { @@ -131,7 +131,7 @@ void rsqrt_kernel_cuda(TensorIteratorBase& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char sqrt_name[] = "sqrt_kernel"; +constexpr char sqrt_name[] = "sqrt_kernel"; void sqrt_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 83233f3143cba..2a811e314c2cc 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -25,7 +25,7 @@ void logical_not_kernel_cuda(TensorIteratorBase& iter) { } // NB: Ignores the negative bit on tensors -CONSTEXPR_EXCEPT_WIN_CUDA char neg_name[] = "neg_kernel"; +constexpr char neg_name[] = "neg_kernel"; void neg_kernel_cuda(TensorIteratorBase& iter) { auto dtype = iter.dtype(); if (at::isComplexType(dtype)) { @@ -96,7 +96,7 @@ C10_HOST_DEVICE static inline c10::complex sgn_wrapper(c10::complex z) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char sgn_name[] = "sgn_kernel"; +constexpr char sgn_name[] = "sgn_kernel"; void sgn_kernel_cuda(TensorIteratorBase& iter){ auto dtype = iter.dtype(); #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu index 995995aec7d1a..af560d8e9a50a 100644 --- a/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu @@ -19,7 +19,7 @@ namespace at::native { -CONSTEXPR_EXCEPT_WIN_CUDA char exp2_name[] = "exp2_kernel"; +constexpr char exp2_name[] = "exp2_kernel"; void exp2_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( @@ -41,7 +41,7 @@ void exp2_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char i0_name[] = "i0"; +constexpr char i0_name[] = "i0"; void i0_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0_cuda", [&]() { @@ -63,7 +63,7 @@ void i0_kernel_cuda(TensorIteratorBase& iter) { } // See note [Jiterator] 
-CONSTEXPR_EXCEPT_WIN_CUDA char i0e_name[] = "calc_i0e"; +constexpr char i0e_name[] = "calc_i0e"; void i0e_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i0e_cuda", [&]() { @@ -84,7 +84,7 @@ void i0e_kernel_cuda(TensorIteratorBase& iter) { // See note [Jiterator] -CONSTEXPR_EXCEPT_WIN_CUDA char i1_name[] = "i1"; +constexpr char i1_name[] = "i1"; void i1_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i1_cuda", [&]() { @@ -102,7 +102,7 @@ void i1_kernel_cuda(TensorIteratorBase& iter) { #endif // AT_USE_JITERATOR() } -CONSTEXPR_EXCEPT_WIN_CUDA char i1e_name[] = "i1e"; +constexpr char i1e_name[] = "i1e"; void i1e_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "i1e_cuda", [&]() { @@ -120,7 +120,7 @@ void i1e_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char sigmoid_name[] = "sigmoid"; +constexpr char sigmoid_name[] = "sigmoid"; void sigmoid_kernel_cuda(TensorIteratorBase& iter) { auto common_dtype = iter.common_dtype(); if (at::isComplexType(common_dtype)) { @@ -159,7 +159,7 @@ void sigmoid_kernel_cuda(TensorIteratorBase& iter) { } } -CONSTEXPR_EXCEPT_WIN_CUDA char sinc_name[] = "sinc"; +constexpr char sinc_name[] = "sinc"; void sinc_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( @@ -217,7 +217,7 @@ void logit_kernel_cuda(TensorIteratorBase& iter, const Scalar& eps_scalar) { }); } -CONSTEXPR_EXCEPT_WIN_CUDA char ndtri_name[] = "ndtri"; +constexpr char ndtri_name[] = "ndtri"; void ndtri_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "ndtri_cuda", [&]() { @@ -234,7 +234,7 @@ void ndtri_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char log_ndtr_name[] = "log_ndtr"; +constexpr char log_ndtr_name[] = "log_ndtr"; void log_ndtr_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "log_ndtr_cuda", [&]() { @@ -259,7 +259,7 @@ void erf_kernel_cuda(TensorIteratorBase& iter) { }); } -CONSTEXPR_EXCEPT_WIN_CUDA char erfc_name[] = "erfc_kernel"; +constexpr char erfc_name[] = "erfc_kernel"; void erfc_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "erfc_cuda", [&]() { @@ -278,7 +278,7 @@ void erfc_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char erfinv_name[] = "erfinv_kernel"; +constexpr char erfinv_name[] = "erfinv_kernel"; void erfinv_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "erfinv_cuda", [&]() { @@ -297,7 +297,7 @@ void erfinv_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char erfcx_name[] = "erfcx"; +constexpr char erfcx_name[] = "erfcx"; void erfcx_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "erfcx_cuda", [&]() { @@ -314,7 +314,7 @@ void erfcx_kernel_cuda(TensorIteratorBase& iter) { #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char kaiser_window_name[] = "kaiser_window"; +constexpr char 
kaiser_window_name[] = "kaiser_window"; void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, double beta_){ #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ @@ -348,7 +348,7 @@ void kaiser_window_kernel_cuda(TensorIteratorBase& iter, int64_t window_length, #endif } -CONSTEXPR_EXCEPT_WIN_CUDA char entr_name[] = "entr"; +constexpr char entr_name[] = "entr"; void entr_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.common_dtype(), "entr_cuda", [&]() { diff --git a/aten/src/ATen/native/cuda/ZetaKernel.cu b/aten/src/ATen/native/cuda/ZetaKernel.cu index 7459504f508cb..da536e8adbdd6 100644 --- a/aten/src/ATen/native/cuda/ZetaKernel.cu +++ b/aten/src/ATen/native/cuda/ZetaKernel.cu @@ -15,7 +15,7 @@ namespace { * See note [3-Clause BSD License for the Cephes Math Library]. */ // See note [Jiterator] -CONSTEXPR_EXCEPT_WIN_CUDA char zeta_name[] = "zeta"; +constexpr char zeta_name[] = "zeta"; void zeta_kernel_cuda(TensorIteratorBase& iter) { #if AT_USE_JITERATOR() AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "zeta_cuda", [&]() { diff --git a/aten/src/ATen/native/cuda/airy_ai.cu b/aten/src/ATen/native/cuda/airy_ai.cu index 35e6b002260c2..05257c99b1b22 100644 --- a/aten/src/ATen/native/cuda/airy_ai.cu +++ b/aten/src/ATen/native/cuda/airy_ai.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { -CONSTEXPR_EXCEPT_WIN_CUDA char airy_ai_name[] = "airy_ai_forward"; +constexpr char airy_ai_name[] = "airy_ai_forward"; void airy_ai_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/bessel_j0.cu b/aten/src/ATen/native/cuda/bessel_j0.cu index 2ebfe676e50b9..a3d9b668e9556 100644 --- a/aten/src/ATen/native/cuda/bessel_j0.cu +++ b/aten/src/ATen/native/cuda/bessel_j0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { -CONSTEXPR_EXCEPT_WIN_CUDA char bessel_j0_name[] = "bessel_j0_forward"; +constexpr char bessel_j0_name[] = "bessel_j0_forward"; void bessel_j0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/bessel_j1.cu b/aten/src/ATen/native/cuda/bessel_j1.cu index 42bd43321f40b..674fcadfdff1a 100644 --- a/aten/src/ATen/native/cuda/bessel_j1.cu +++ b/aten/src/ATen/native/cuda/bessel_j1.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { -CONSTEXPR_EXCEPT_WIN_CUDA char bessel_j1_name[] = "bessel_j1_forward"; +constexpr char bessel_j1_name[] = "bessel_j1_forward"; void bessel_j1_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/bessel_y0.cu b/aten/src/ATen/native/cuda/bessel_y0.cu index 631031d4e26c5..344ea38765227 100644 --- a/aten/src/ATen/native/cuda/bessel_y0.cu +++ b/aten/src/ATen/native/cuda/bessel_y0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char bessel_y0_name[] = "bessel_y0_forward"; + constexpr char bessel_y0_name[] = "bessel_y0_forward"; void bessel_y0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/bessel_y1.cu b/aten/src/ATen/native/cuda/bessel_y1.cu index 1375061e43e08..32433a22b0bbc 100644 --- a/aten/src/ATen/native/cuda/bessel_y1.cu +++ b/aten/src/ATen/native/cuda/bessel_y1.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char bessel_y1_name[] = "bessel_y1_forward"; + constexpr char 
bessel_y1_name[] = "bessel_y1_forward"; void bessel_y1_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu index 7736d20e01887..a84e0c5050e0c 100644 --- a/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu +++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_t.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_t_name[] = "chebyshev_polynomial_t_forward"; + constexpr char chebyshev_polynomial_t_name[] = "chebyshev_polynomial_t_forward"; void chebyshev_polynomial_t_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu index 412479e11f491..9ec870fd130a8 100644 --- a/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu +++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_u.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_u_name[] = "chebyshev_polynomial_u_forward"; + constexpr char chebyshev_polynomial_u_name[] = "chebyshev_polynomial_u_forward"; void chebyshev_polynomial_u_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu index ca2e534e641b6..7f393d9d674de 100644 --- a/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu +++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_v.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_v_name[] = "chebyshev_polynomial_v_forward"; + constexpr char chebyshev_polynomial_v_name[] = "chebyshev_polynomial_v_forward"; void chebyshev_polynomial_v_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu b/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu index 9d5a0e3a7bd33..9897213ee97d2 100644 --- a/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu +++ b/aten/src/ATen/native/cuda/chebyshev_polynomial_w.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char chebyshev_polynomial_w_name[] = "chebyshev_polynomial_w_forward"; + constexpr char chebyshev_polynomial_w_name[] = "chebyshev_polynomial_w_forward"; void chebyshev_polynomial_w_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/hermite_polynomial_h.cu b/aten/src/ATen/native/cuda/hermite_polynomial_h.cu index f53253bcd0994..d581e38bbefef 100644 --- a/aten/src/ATen/native/cuda/hermite_polynomial_h.cu +++ b/aten/src/ATen/native/cuda/hermite_polynomial_h.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char hermite_polynomial_h_name[] = "hermite_polynomial_h_forward"; + constexpr char hermite_polynomial_h_name[] = "hermite_polynomial_h_forward"; void hermite_polynomial_h_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/hermite_polynomial_he.cu b/aten/src/ATen/native/cuda/hermite_polynomial_he.cu index bab376565858a..b5b1891b80cf8 100644 --- a/aten/src/ATen/native/cuda/hermite_polynomial_he.cu +++ b/aten/src/ATen/native/cuda/hermite_polynomial_he.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char hermite_polynomial_he_name[] = 
"hermite_polynomial_he_forward"; + constexpr char hermite_polynomial_he_name[] = "hermite_polynomial_he_forward"; void hermite_polynomial_he_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu b/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu index a98336dfcb6e3..0490fc97cc548 100644 --- a/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu +++ b/aten/src/ATen/native/cuda/laguerre_polynomial_l.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char laguerre_polynomial_l_name[] = "laguerre_polynomial_l_forward"; + constexpr char laguerre_polynomial_l_name[] = "laguerre_polynomial_l_forward"; void laguerre_polynomial_l_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/modified_bessel_i0.cu b/aten/src/ATen/native/cuda/modified_bessel_i0.cu index 9f1f3ba98c679..5d5e60c132c99 100644 --- a/aten/src/ATen/native/cuda/modified_bessel_i0.cu +++ b/aten/src/ATen/native/cuda/modified_bessel_i0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_i0_name[] = "modified_bessel_i0_forward"; + constexpr char modified_bessel_i0_name[] = "modified_bessel_i0_forward"; void modified_bessel_i0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/modified_bessel_i1.cu b/aten/src/ATen/native/cuda/modified_bessel_i1.cu index d51e7fefb0eb1..4576ce07042e6 100644 --- a/aten/src/ATen/native/cuda/modified_bessel_i1.cu +++ b/aten/src/ATen/native/cuda/modified_bessel_i1.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_i1_name[] = "modified_bessel_i1_forward"; + constexpr char modified_bessel_i1_name[] = "modified_bessel_i1_forward"; void modified_bessel_i1_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/modified_bessel_k0.cu b/aten/src/ATen/native/cuda/modified_bessel_k0.cu index 574268456c847..17de0d94a69a4 100644 --- a/aten/src/ATen/native/cuda/modified_bessel_k0.cu +++ b/aten/src/ATen/native/cuda/modified_bessel_k0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_k0_name[] = "modified_bessel_k0_forward"; + constexpr char modified_bessel_k0_name[] = "modified_bessel_k0_forward"; void modified_bessel_k0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/modified_bessel_k1.cu b/aten/src/ATen/native/cuda/modified_bessel_k1.cu index b3720d8e1ba98..a858ad52af6a9 100644 --- a/aten/src/ATen/native/cuda/modified_bessel_k1.cu +++ b/aten/src/ATen/native/cuda/modified_bessel_k1.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char modified_bessel_k1_name[] = "modified_bessel_k1_forward"; + constexpr char modified_bessel_k1_name[] = "modified_bessel_k1_forward"; void modified_bessel_k1_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu b/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu index ac2355e409ac2..880b6b54c1873 100644 --- a/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu +++ b/aten/src/ATen/native/cuda/scaled_modified_bessel_k0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char scaled_modified_bessel_k0_name[] = "scaled_modified_bessel_k0_forward"; + constexpr char 
scaled_modified_bessel_k0_name[] = "scaled_modified_bessel_k0_forward"; void scaled_modified_bessel_k0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu b/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu index b1d8d2a41b62b..7e5c771dc80b1 100644 --- a/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu +++ b/aten/src/ATen/native/cuda/scaled_modified_bessel_k1.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char scaled_modified_bessel_k1_name[] = "scaled_modified_bessel_k1_forward"; + constexpr char scaled_modified_bessel_k1_name[] = "scaled_modified_bessel_k1_forward"; void scaled_modified_bessel_k1_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu index d86042030cd69..e08081495ecb0 100644 --- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu +++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_t.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_t_name[] = "shifted_chebyshev_polynomial_t_forward"; + constexpr char shifted_chebyshev_polynomial_t_name[] = "shifted_chebyshev_polynomial_t_forward"; void shifted_chebyshev_polynomial_t_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu index a2e2cd485fdaf..12fe938334a20 100644 --- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu +++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_u.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_u_name[] = "shifted_chebyshev_polynomial_u_forward"; + constexpr char shifted_chebyshev_polynomial_u_name[] = "shifted_chebyshev_polynomial_u_forward"; void shifted_chebyshev_polynomial_u_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu index 6e5404179ab93..19db5a5ed53dd 100644 --- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu +++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_v.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { -CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_v_name[] = "shifted_chebyshev_polynomial_v_forward"; +constexpr char shifted_chebyshev_polynomial_v_name[] = "shifted_chebyshev_polynomial_v_forward"; void shifted_chebyshev_polynomial_v_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu index 3bfee57d14ee3..d53b026947a62 100644 --- a/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu +++ b/aten/src/ATen/native/cuda/shifted_chebyshev_polynomial_w.cu @@ -10,7 +10,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char shifted_chebyshev_polynomial_w_name[] = "shifted_chebyshev_polynomial_w_forward"; + constexpr char shifted_chebyshev_polynomial_w_name[] = "shifted_chebyshev_polynomial_w_forward"; void shifted_chebyshev_polynomial_w_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git 
a/aten/src/ATen/native/cuda/spherical_bessel_j0.cu b/aten/src/ATen/native/cuda/spherical_bessel_j0.cu index d0bf46e653946..14234b27e54e0 100644 --- a/aten/src/ATen/native/cuda/spherical_bessel_j0.cu +++ b/aten/src/ATen/native/cuda/spherical_bessel_j0.cu @@ -20,7 +20,7 @@ namespace at::native { namespace { - CONSTEXPR_EXCEPT_WIN_CUDA char spherical_bessel_j0_name[] = "spherical_bessel_j0_forward"; + constexpr char spherical_bessel_j0_name[] = "spherical_bessel_j0_forward"; void spherical_bessel_j0_kernel_cuda(TensorIteratorBase& iterator) { #if AT_USE_JITERATOR() diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index a763359ae71bb..157b2a51287ab 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -445,66 +445,14 @@ __host__ __device__ #define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE #endif -#if defined(__CUDA_ARCH__) -#if defined(_MSC_VER) && defined(__CUDACC__) -#define CONSTEXPR_EXCEPT_WIN_CUDA const -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__ - -// Note [static constexpr char* members for windows NVCC] -// The Windows NVCC compiler doesn't handle static constexpr class members, -// although it's fixed in a later version. -// (see -// https://developercommunity.visualstudio.com/t/intellisense-error-c11-static-constexpr-member-ini/245425) -// -// If we want to ensure that our field is static under all builds, then we need -// to work around it specifically for windows NVCC by making it (a) const, (b) -// defined outside of the class definition We need to define it outside of the -// class definition because of the C++ standard; char* is not an integral type -// (see -// https://stackoverflow.com/questions/24278473/intellisense-a-member-of-type-const-char-const-cannot-have-an-in-class-in) -// -// So instead of this: -// struct Foo { -// static constexpr const char* name = "foo"; -// } -// In Windows NVCC, we end up with this: -// struct Foo { -// static const char* name; -// } -// const char* Foo::name = "foo"; -// -// This gives us a small perf hit for any code that wants to access these field -// members, but right now it isn't used in any perf-critical code paths. 
-#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static const char* field; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) \ - const char* cls::field = val; -#else -#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__ - -#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static constexpr const char* field = val; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) -#endif -#else -#if defined(_MSC_VER) && defined(__CUDACC__) -#define CONSTEXPR_EXCEPT_WIN_CUDA const -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA - -#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static const char* field; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) \ - const char* cls::field = val; -#else +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) #define CONSTEXPR_EXCEPT_WIN_CUDA constexpr #define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr #define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static constexpr const char* field = val; + static constexpr const char field[] = val; #define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) -#endif -#endif +#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) #ifndef HAS_DEMANGLE #if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index db9dbdc478122..c977d7e92b2a6 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -76,13 +76,13 @@ class ArrayRef final { constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} /// Construct an ArrayRef from a pointer and length. - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef(const T* data, size_t length) + constexpr ArrayRef(const T* data, size_t length) : Data(data), Length(length) { debugCheckNullptrInvariant(); } /// Construct an ArrayRef from a range. - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef(const T* begin, const T* end) + constexpr ArrayRef(const T* begin, const T* end) : Data(begin), Length(end - begin) { debugCheckNullptrInvariant(); } @@ -182,14 +182,14 @@ class ArrayRef final { } /// front - Get the first element. - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& front() const { + constexpr const T& front() const { TORCH_CHECK( !empty(), "ArrayRef: attempted to access front() of empty list"); return Data[0]; } /// back - Get the last element. - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& back() const { + constexpr const T& back() const { TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); return Data[Length - 1]; } @@ -200,8 +200,7 @@ class ArrayRef final { } /// slice(n, m) - Take M elements of the array starting at element N - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef slice(size_t N, size_t M) - const { + constexpr ArrayRef slice(size_t N, size_t M) const { TORCH_CHECK( N + M <= size(), "ArrayRef: invalid slice, N = ", @@ -214,7 +213,7 @@ class ArrayRef final { } /// slice(n) - Chop off the first N elements of the array. 
- C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef slice(size_t N) const { + constexpr ArrayRef slice(size_t N) const { TORCH_CHECK( N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); return slice(N, size() - N); @@ -228,7 +227,7 @@ class ArrayRef final { } /// Vector compatibility - C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& at(size_t Index) const { + constexpr const T& at(size_t Index) const { TORCH_CHECK( Index < Length, "ArrayRef: invalid index Index = ", diff --git a/c10/util/ConstexprCrc.h b/c10/util/ConstexprCrc.h index 0eec44d576e98..96f1113a14c8c 100644 --- a/c10/util/ConstexprCrc.h +++ b/c10/util/ConstexprCrc.h @@ -98,8 +98,10 @@ constexpr uint64_t crc64_table[] = { 0x29b7d047efec8728, }; -inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA uint64_t -crc64impl(uint64_t accumulator, const char* data, size_t size) { +inline constexpr uint64_t crc64impl( + uint64_t accumulator, + const char* data, + size_t size) { for (size_t i = 0; i < size; ++i) { accumulator = crc64_table[(accumulator ^ data[i]) & 0xFF] ^ (accumulator >> 8); @@ -116,12 +118,11 @@ struct crc64_t final : IdWrapper { }; // CRC64 with Jones coefficients and an init value of 0. -inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA crc64_t -crc64(const char* str, size_t size) { +inline constexpr crc64_t crc64(const char* str, size_t size) { return crc64_t{detail::crc64impl(0, str, size)}; } -inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA crc64_t crc64(c10::string_view str) { +inline constexpr crc64_t crc64(c10::string_view str) { return crc64(str.data(), str.size()); } } // namespace c10::util diff --git a/c10/util/typeid.h b/c10/util/typeid.h index ebf44b327168c..13f8a2adec085 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -71,7 +71,7 @@ class C10_API TypeIdentifier final * is generated during run-time. Do NOT serialize the id for storage. */ template - static C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA TypeIdentifier Get() noexcept { + static constexpr TypeIdentifier Get() noexcept { return TypeIdentifier(c10::util::get_type_index()); } @@ -425,7 +425,7 @@ class C10_API TypeMeta final { // Below are static functions that can be called by passing a specific type. 
template - static C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA TypeIdentifier Id() noexcept { + static constexpr TypeIdentifier Id() noexcept { return TypeIdentifier::Get(); } diff --git a/tools/linter/adapters/constexpr_linter.py b/tools/linter/adapters/constexpr_linter.py deleted file mode 100644 index adb7fe001749a..0000000000000 --- a/tools/linter/adapters/constexpr_linter.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -CONSTEXPR: Ensures users don't use vanilla constexpr since it causes issues -""" - -from __future__ import annotations - -import argparse -import json -import logging -import sys -from enum import Enum -from typing import NamedTuple - - -CONSTEXPR = "constexpr char" -CONSTEXPR_MACRO = "CONSTEXPR_EXCEPT_WIN_CUDA char" - -LINTER_CODE = "CONSTEXPR" - - -class LintSeverity(str, Enum): - ERROR = "error" - - -class LintMessage(NamedTuple): - path: str | None - line: int | None - char: int | None - code: str - severity: LintSeverity - name: str - original: str | None - replacement: str | None - description: str | None - - -def check_file(filename: str) -> LintMessage | None: - logging.debug("Checking file %s", filename) - - with open(filename) as f: - lines = f.readlines() - - for idx, line in enumerate(lines): - if CONSTEXPR in line: - original = "".join(lines) - replacement = original.replace(CONSTEXPR, CONSTEXPR_MACRO) - logging.debug("replacement: %s", replacement) - return LintMessage( - path=filename, - line=idx, - char=None, - code=LINTER_CODE, - severity=LintSeverity.ERROR, - name="Vanilla constexpr used, prefer macros", - original=original, - replacement=replacement, - description="Vanilla constexpr used, prefer macros run `lintrunner --take CONSTEXPR -a` to apply changes.", - ) - return None - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="CONSTEXPR linter", - fromfile_prefix_chars="@", - ) - parser.add_argument( - "--verbose", - action="store_true", - ) - parser.add_argument( - "filenames", - nargs="+", - help="paths to lint", - ) - - args = parser.parse_args() - - logging.basicConfig( - format="<%(threadName)s:%(levelname)s> %(message)s", - level=logging.NOTSET - if args.verbose - else logging.DEBUG - if len(args.filenames) < 1000 - else logging.INFO, - stream=sys.stderr, - ) - - lint_messages = [] - for filename in args.filenames: - lint_message = check_file(filename) - if lint_message is not None: - lint_messages.append(lint_message) - - for lint_message in lint_messages: - print(json.dumps(lint_message._asdict()), flush=True) diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 558dcdee57af2..8e9be1de48a5f 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -593,7 +593,7 @@ struct TORCH_API ModulePolicy { } // are we going to return everything? If so, we can optimize the calculate // of the size of the list. 
- static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = false; + static constexpr bool all_slots = false; }; struct TORCH_API ParameterPolicy { @@ -606,7 +606,7 @@ struct TORCH_API ParameterPolicy { static bool valid(const ClassTypePtr& typ, size_t i, const IValue& v) { return typ->is_parameter(i) && v.isTensor(); } - static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = false; + static constexpr bool all_slots = false; }; struct TORCH_API BufferPolicy { @@ -620,7 +620,7 @@ struct TORCH_API BufferPolicy { return typ->getAttribute(i)->isSubtypeOf(*TensorType::get()) && typ->is_buffer(i); } - static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = false; + static constexpr bool all_slots = false; }; struct TORCH_API AttributePolicy { @@ -633,7 +633,7 @@ struct TORCH_API AttributePolicy { static bool valid(const ClassTypePtr& typ, size_t i, const IValue& v) { return true; } - static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = true; + static constexpr bool all_slots = true; }; // take a Policy object, and make a version of it that returns the slot. diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 9be9b0fb2d8c1..cf45166d464d7 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -216,7 +216,7 @@ class TORCH_API Pickler { // the left of a '::', its type cannot be deduced by the compiler so one must // explicitly instantiate the template, i.e. push(int) works, push(int) // does not) - static CONSTEXPR_EXCEPT_WIN_CUDA size_t kBufferSize = 256; + static constexpr size_t kBufferSize = 256; template void push(std::common_type_t value) { const char* begin = reinterpret_cast(&value); diff --git a/torchgen/gen.py b/torchgen/gen.py index 3c81fc0596599..ab918577c6160 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -600,19 +600,15 @@ def __call__(self, f: NativeFunction) -> str: using schema = {sig.type()}; using ptr_schema = schema*; // See Note [static constexpr char* members for windows NVCC] - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::{f.func.name.name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "{f.func.name.overload_name}") - STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, {cpp_string(str(f.func))}) + static constexpr const char* name = "aten::{f.func.name.name}"; + static constexpr const char* overload_name = "{f.func.name.overload_name}"; + static constexpr const char* schema_str = {cpp_string(str(f.func))}; static {sig.defn(name="call", is_redispatching_fn=False)}; static {sig.defn(name="redispatch", is_redispatching_fn=True)}; }};""" elif self.target is Target.DEFINITION: defns = f""" -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, name, "aten::{f.func.name.name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, overload_name, "{f.func.name.overload_name}") -STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA({name}, schema_str, {cpp_string(str(f.func))}) - // aten::{f.func} static C10_NOINLINE c10::TypedOperatorHandle<{name}::schema> create_{name}_typed_handle() {{ return c10::Dispatcher::singleton() From 0efa590d435d2b4aefcbad9014dd5fa75dcf8405 Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" Date: Thu, 24 Oct 2024 11:39:01 +0000 Subject: [PATCH 016/161] [CI] Fix XPU CI failure (#138548) # Motivation Fix https://github.com/pytorch/pytorch/issues/138577. # Solution 1. All UTs in `test/inductor/test_compiled_optimizers.py` are fixed by https://github.com/pytorch/pytorch/pull/134170 2. 
UT in `test/inductor/test_pattern_matcher.py` is introduced by https://github.com/pytorch/pytorch/pull/138089, we will skip this UT due to the unsupported feature `max_autotune_gemm_backends:Triton`. 3. We have a new impl related to `histc`, so we remove the expected failure from `test/inductor/test_torchinductor_opinfo.py` 4. We support `avg_pool3d` for `fp16` data type, so we remove the expected failure from `test/inductor/test_torchinductor_opinfo.py` 5. CUDA-bias code is introduced by https://github.com/pytorch/pytorch/issues/138472, we just generalize it to `GPU_TYPE`. # Additional Context > Why update torch-xpu-ops commit pin here? We have to update commit pin to avoid the build failure raised by the code change [C10_UNUSED](https://github.com/pytorch/pytorch/pull/138364). > What does the feature of torch-xpu-ops update? 1. Add some foreach ops, like `unary ops` and `foreach_clamp_max` etc; 2. Add some maxpool ops forward and backward, like `averge_pool3d` and `max_pool3d` 3. Add some other ops, like `log_normal_`, `index_copy`, and `mode` etc; 4. fix build failure related to `C10_UNUSED`; Pull Request resolved: https://github.com/pytorch/pytorch/pull/138548 Approved by: https://github.com/malfet, https://github.com/EikanWang --- test/inductor/test_compiled_optimizers.py | 5 +---- test/inductor/test_pattern_matcher.py | 1 + test/inductor/test_torchinductor_opinfo.py | 2 -- test/inductor/test_triton_kernels.py | 2 +- third_party/xpu.txt | 2 +- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/test/inductor/test_compiled_optimizers.py b/test/inductor/test_compiled_optimizers.py index 7e0f607fbcb49..d070cb92abe53 100644 --- a/test/inductor/test_compiled_optimizers.py +++ b/test/inductor/test_compiled_optimizers.py @@ -121,6 +121,7 @@ class KernelCounts(NamedTuple): "test_adamw_amsgrad_capturable_cuda": 6, "test_adamw_amsgrad_capturable_xpu": 6, "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": 6, + "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": 6, "test_adamw_tensor_lr_amsgrad_capturable_cuda": 6, "test_adamw_tensor_lr_amsgrad_capturable_xpu": 6, "test_adam_tensor_lr_amsgrad_capturable_cuda": 6, @@ -153,7 +154,6 @@ class KernelCounts(NamedTuple): "test_sgd_cuda": 4, "test_sgd_cpu": 4, "test_sgd_xpu": 4, - "test_rmsprop_tensor_lr_capturable_foreach_xpu": 4, "test_adagrad_initial_accumulator_value_weight_decay_foreach_xpu": 2, "test_adagrad_lr_decay_weight_decay_foreach_xpu": 2, "test_adagrad_weight_decay_foreach_xpu": 2, @@ -167,14 +167,11 @@ class KernelCounts(NamedTuple): "test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": 8, "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": 6, "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": 9, - "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_foreach_xpu": 3, "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": 6, "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": 6, - "test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_foreach_xpu": 3, "test_sgd_tensor_lr_cpu": 2, "test_sgd_tensor_lr_cuda": 2, "test_sgd_tensor_lr_xpu": 2, - "test_sgd_tensor_lr_foreach_xpu": 2, } # also tracks currently supported optimizers diff --git a/test/inductor/test_pattern_matcher.py b/test/inductor/test_pattern_matcher.py index 58387f9e5dd1d..d6bfdbcc05f91 100644 --- a/test/inductor/test_pattern_matcher.py +++ b/test/inductor/test_pattern_matcher.py @@ 
-1234,6 +1234,7 @@ def remap_fake_tensor(x): # of search_fn). self.assertTrue(pattern.pattern_eq(search_fn_pattern)) + @skipIfXpu @inductor_config.patch( { "triton.unique_kernel_names": "original_aten", diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index 6cc5cd9fd26c1..f2cc17e258f60 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ b/test/inductor/test_torchinductor_opinfo.py @@ -351,11 +351,9 @@ def format_op(op): "nn.functional.conv_transpose3d": {f32, f64}, # rrelu not supported on XPU now "nn.functional.rrelu": {f16, f32, f64}, - "histc": {i32, i64}, # not implemented for 'Half' "nn.functional.multilabel_margin_loss": {f16}, "nn.functional.multi_margin_loss": {f16}, - "nn.functional.avg_pool3d": {f16}, } diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index e08963e55bcdb..f1475dae694fe 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -2158,7 +2158,7 @@ def sin_triton(x, out): n_elements = out.numel() sin_kernel[(n_elements,)](x, out, n_elements) - x = torch.randn(65, device="cuda") + x = torch.randn(65, device=GPU_TYPE) out = torch.empty_like(x) out_compiled = torch.empty_like(x) sin_triton_compiled = torch.compile(fullgraph=True)(sin_triton) diff --git a/third_party/xpu.txt b/third_party/xpu.txt index 711935d34da44..b79c4dba924e9 100644 --- a/third_party/xpu.txt +++ b/third_party/xpu.txt @@ -1 +1 @@ -1d217ae491669b550b136ca16e91b85c4597cd66 +b3d5d78c72eadc5140aef1f8e06844385e9a2d45 From e7d4de0e598d2af4d432e299b47ad95e821edd19 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 24 Oct 2024 10:20:58 +0000 Subject: [PATCH 017/161] Eliminate C10_TYPENAME_CONSTEXPR (#138702) Test Plan: Sandcastle Differential Revision: D64833560 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138702 Approved by: https://github.com/malfet --- c10/test/util/TypeIndex_test.cpp | 23 ++++------ c10/util/TypeIndex.h | 75 +++----------------------------- 2 files changed, 14 insertions(+), 84 deletions(-) diff --git a/c10/test/util/TypeIndex_test.cpp b/c10/test/util/TypeIndex_test.cpp index 370a0ad81ba3d..5979d92edd592 100644 --- a/c10/test/util/TypeIndex_test.cpp +++ b/c10/test/util/TypeIndex_test.cpp @@ -55,11 +55,11 @@ static_assert( ""); namespace test_top_level_name { -#if C10_TYPENAME_SUPPORTS_CONSTEXPR + static_assert( string_view::npos != get_fully_qualified_type_name().find("Dummy"), ""); -#endif + TEST(TypeIndex, TopLevelName) { EXPECT_NE( string_view::npos, get_fully_qualified_type_name().find("Dummy")); @@ -69,12 +69,11 @@ TEST(TypeIndex, TopLevelName) { namespace test_nested_name { struct Dummy final {}; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR static_assert( string_view::npos != get_fully_qualified_type_name().find("test_nested_name::Dummy"), ""); -#endif + TEST(TypeIndex, NestedName) { EXPECT_NE( string_view::npos, @@ -87,7 +86,6 @@ template struct Outer final {}; struct Inner final {}; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR static_assert( string_view::npos != get_fully_qualified_type_name>().find( @@ -98,7 +96,7 @@ static_assert( get_fully_qualified_type_name>().find( "test_type_template_parameter::Inner"), ""); -#endif + TEST(TypeIndex, TypeTemplateParameter) { EXPECT_NE( string_view::npos, @@ -115,12 +113,11 @@ namespace test_nontype_template_parameter { template struct Class final {}; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR static_assert( string_view::npos != get_fully_qualified_type_name>().find("38474355"), ""); -#endif + 
TEST(TypeIndex, NonTypeTemplateParameter) { EXPECT_NE( string_view::npos, @@ -134,7 +131,6 @@ struct Type final { using type = const T*; }; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR static_assert( string_view::npos != get_fully_qualified_type_name::type>().find("int"), @@ -151,7 +147,7 @@ static_assert( std::remove_pointer_t::type>>() .find("*"), ""); -#endif + TEST(TypeIndex, TypeComputationsAreResolved) { EXPECT_NE( string_view::npos, @@ -170,14 +166,14 @@ TEST(TypeIndex, TypeComputationsAreResolved) { struct Functor final { std::string operator()(int64_t a, const Type& b) const; }; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR + static_assert( // NOLINTNEXTLINE(misc-redundant-expression) get_fully_qualified_type_name&)>() == get_fully_qualified_type_name< typename c10::guts::infer_function_traits_t::func_type>(), ""); -#endif + TEST(TypeIndex, FunctionTypeComputationsAreResolved) { EXPECT_EQ( get_fully_qualified_type_name&)>(), @@ -189,7 +185,6 @@ TEST(TypeIndex, FunctionTypeComputationsAreResolved) { namespace test_function_arguments_and_returns { class Dummy final {}; -#if C10_TYPENAME_SUPPORTS_CONSTEXPR static_assert( string_view::npos != get_fully_qualified_type_name().find( @@ -200,7 +195,7 @@ static_assert( get_fully_qualified_type_name().find( "test_function_arguments_and_returns::Dummy"), ""); -#endif + TEST(TypeIndex, FunctionArgumentsAndReturns) { EXPECT_NE( string_view::npos, diff --git a/c10/util/TypeIndex.h b/c10/util/TypeIndex.h index 75b672d4a183f..d4af28daf52be 100644 --- a/c10/util/TypeIndex.h +++ b/c10/util/TypeIndex.h @@ -9,56 +9,12 @@ #include #include -namespace c10::util { - -// TODO Make it work for more compilers - -// Intel compiler works -#if defined(__INTEL_COMPILER) -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0 -#define C10_TYPENAME_CONSTEXPR - -// Clang works -#elif defined(__clang__) - -// except for NVCC -#if defined(__CUDACC__) -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0 -#define C10_TYPENAME_CONSTEXPR -#else +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) #define C10_TYPENAME_SUPPORTS_CONSTEXPR 1 #define C10_TYPENAME_CONSTEXPR constexpr #endif -// Windows works -#elif defined(_MSC_VER) - -// except for NVCC -#if defined(__CUDACC__) -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0 -#define C10_TYPENAME_CONSTEXPR -#else -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1 -#define C10_TYPENAME_CONSTEXPR constexpr -#endif - -// GCC works -#elif defined(__GNUC__) - -// except when gcc < 9 -#if (__GNUC__ < 9) || defined(__CUDACC__) -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0 -#define C10_TYPENAME_CONSTEXPR -#else -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1 -#define C10_TYPENAME_CONSTEXPR constexpr -#endif - -// some other compiler we don't know about -#else -#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1 -#define C10_TYPENAME_CONSTEXPR constexpr -#endif +namespace c10::util { struct type_index final : IdWrapper { constexpr explicit type_index(uint64_t checksum) : IdWrapper(checksum) {} @@ -76,17 +32,6 @@ struct type_index final : IdWrapper { namespace detail { -#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ - __GNUC__ < 5 -// Getting __PRETTY_FUNCTION__ at compile time only works with GCC >= 5 -#error "You're running a too old version of GCC. We need GCC 5 or later." -#endif - -#if defined(__clang__) && __clang_major__ < 4 -// Getting __PRETTY_FUNCTION__ at compile time only works with Clang >= 4 -#error "You're running a too old version of Clang. We need Clang 4 or later." 
-#endif - inline constexpr string_view extract( string_view prefix, string_view suffix, @@ -101,7 +46,7 @@ inline constexpr string_view extract( } template -inline C10_TYPENAME_CONSTEXPR c10::string_view fully_qualified_type_name_impl() { +inline constexpr c10::string_view fully_qualified_type_name_impl() { #if defined(_MSC_VER) && !defined(__clang__) #if defined(__NVCC__) return extract( @@ -121,11 +66,7 @@ inline C10_TYPENAME_CONSTEXPR c10::string_view fully_qualified_type_name_impl() __PRETTY_FUNCTION__); #elif defined(__GNUC__) return extract( -#if C10_TYPENAME_SUPPORTS_CONSTEXPR "constexpr c10::string_view c10::util::detail::fully_qualified_type_name_impl() [with T = ", -#else - "c10::string_view c10::util::detail::fully_qualified_type_name_impl() [with T = ", -#endif "; c10::string_view = c10::basic_string_view]", __PRETTY_FUNCTION__); #endif @@ -181,14 +122,8 @@ inline constexpr type_index get_type_index() { #endif template -inline C10_TYPENAME_CONSTEXPR string_view -get_fully_qualified_type_name() noexcept { -#if C10_TYPENAME_SUPPORTS_CONSTEXPR - constexpr -#else - static -#endif - string_view name = detail::fully_qualified_type_name_impl(); +inline constexpr string_view get_fully_qualified_type_name() noexcept { + constexpr string_view name = detail::fully_qualified_type_name_impl(); return name; } } // namespace c10::util From 0d9fb510288c2f2c81655ff5c236fe11a6ef8001 Mon Sep 17 00:00:00 2001 From: Nick Westlake Date: Thu, 24 Oct 2024 10:43:34 +0000 Subject: [PATCH 018/161] Fix lru_cache where config is used (#134235) Ensure that any use of functools.lru_cache does not prevent config from being changed after the function has already run. Pull Request resolved: https://github.com/pytorch/pytorch/pull/134235 Approved by: https://github.com/masnesral --- torch/_dynamo/variables/builder.py | 8 ++++++-- torch/_inductor/codecache.py | 16 ++++++++++------ torch/_inductor/cpu_vec_isa.py | 9 ++++++--- torch/_inductor/metrics.py | 6 ++++-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index f8d8f0f54dc06..a47948dc541f0 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -433,8 +433,12 @@ def set_source_and_track_mutable(self, value, var): return self.tx.output.side_effects.track_mutable(value, var) @classmethod - @functools.lru_cache(None) def _type_dispatch(cls): + return cls._type_dispatch_impl(config.trace_numpy) + + @classmethod + @functools.lru_cache(None) + def _type_dispatch_impl(cls, trace_numpy): # NB: Careful not to close over self to avoid ref cycle from lru_cache entries = [ ( @@ -460,7 +464,7 @@ def _type_dispatch(cls): (torch.jit.ScriptFunction, cls.wrap_jit_function), ] - if config.trace_numpy and np: + if trace_numpy and np: entries.append((np.ndarray, cls.wrap_numpy_ndarray)) result = {} diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 075fbfb090b1a..738669eec93c7 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -198,6 +198,15 @@ def get_cpp_wrapper_cubin_path_name() -> str: return "cubin_path" if torch.version.hip is None else "hsaco_path" +@functools.lru_cache(None) +def get_global_cache_path_impl(global_cache_dir: str) -> Optional[Path]: + return ( + Path(os.path.join(global_cache_dir, CacheBase.get_system()["hash"])) + if global_cache_dir is not None + else None + ) + + class CacheBase: @staticmethod @functools.lru_cache(None) @@ -244,13 +253,8 @@ def get_local_cache_path() -> Path: 
return Path(os.path.join(cache_dir(), "cache", CacheBase.get_system()["hash"])) @staticmethod - @functools.lru_cache(None) def get_global_cache_path() -> Optional[Path]: - return ( - Path(os.path.join(config.global_cache_dir, CacheBase.get_system()["hash"])) - if config.global_cache_dir is not None - else None - ) + return get_global_cache_path_impl(config.global_cache_dir) def __init__(self) -> None: self.system = CacheBase.get_system() diff --git a/torch/_inductor/cpu_vec_isa.py b/torch/_inductor/cpu_vec_isa.py index 96e8cf55e50a1..9b05a35d4e190 100644 --- a/torch/_inductor/cpu_vec_isa.py +++ b/torch/_inductor/cpu_vec_isa.py @@ -138,10 +138,13 @@ def check_build(self, code: str) -> bool: return True - @functools.lru_cache(None) # noqa: B019 def __bool__(self) -> bool: - if config.cpp.vec_isa_ok is not None: - return config.cpp.vec_isa_ok + return self.__bool__impl(config.cpp.vec_isa_ok) + + @functools.lru_cache(None) # noqa: B019 + def __bool__impl(self, vec_isa_ok) -> bool: + if vec_isa_ok is not None: + return vec_isa_ok if config.is_fbcode(): return True diff --git a/torch/_inductor/metrics.py b/torch/_inductor/metrics.py index fe77279800e3d..bc374729dc656 100644 --- a/torch/_inductor/metrics.py +++ b/torch/_inductor/metrics.py @@ -411,10 +411,12 @@ def purge_old_log_files(): table.write_header() -@lru_cache def enabled_metric_tables() -> Set[str]: - config_str = config.enabled_metric_tables + return enabled_metric_tables_impl(config.enabled_metric_tables) + +@lru_cache +def enabled_metric_tables_impl(config_str: str) -> Set[str]: enabled = set() for name in config_str.split(","): name = name.strip() From 5c88a9f6c0b65984d0e5ac96fd199d2aabd96a93 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Fri, 4 Oct 2024 16:35:16 +0000 Subject: [PATCH 019/161] Assume that indices are non-negative in _unsafe_masked_index (#137315) Pull Request resolved: https://github.com/pytorch/pytorch/pull/137315 Approved by: https://github.com/eellison --- aten/src/ATen/native/TensorAdvancedIndexing.cpp | 2 +- test/inductor/test_torchinductor.py | 4 ++-- torch/_inductor/lowering.py | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ac5d3a87d8d4f..b2f7d78652552 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -662,7 +662,7 @@ Tensor _unsafe_masked_index(const Tensor& self, const Tensor& mask, const torch: // with the main difference being that the when the `mask` is false, the tensor // `self` is not indexed using `indices`. This allows `indices` to be out-of-bounds // when `mask` is false. When `mask` is true, the `indices` are expected to be - // in bounds and is not checked. + // in bounds and is not checked. We also assume that the `indices` are non-negative // // This function is not meant to be executed on eager mode. An unoptimized version // is provided here. 
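For readers who have not seen this op before, the contract spelled out in the comment above can be summarized with a rough Python sketch. This is illustrative only, not the ATen implementation, and the helper name is invented: indices are clamped so the load never faults, and positions where the mask is false take the fill value instead of whatever the clamped load produced. With this patch the clamp may additionally assume the indices are non-negative.

import torch

def unsafe_masked_index_sketch(x, mask, indices, fill):
    # Clamp each index tensor into range so the gather never faults; the op
    # now assumes non-negative indices, so only the upper bound truly matters.
    safe = tuple(idx.clamp(min=0, max=x.size(d) - 1) for d, idx in enumerate(indices))
    gathered = x[safe]
    # Where the mask is false, the (possibly meaningless) gathered value is discarded.
    return torch.where(mask, gathered, torch.full_like(gathered, fill))

x = torch.randn(8)
mask = torch.tensor([True, False, True])
idx = (torch.tensor([3, 9, 2]),)  # 9 is out of bounds, but that position is masked off
print(unsafe_masked_index_sketch(x, mask, idx, 0.0))
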
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index c792af55ed367..89aeb545b270e 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -1695,7 +1695,7 @@ def fn(a, mask, idx): ( torch.randn(8, device=self.device), torch.tensor([True, False, True], device=self.device), - [torch.tensor([3, 9, -2], device=self.device)], + [torch.tensor([3, 9, 2], device=self.device)], ), ) @@ -1708,7 +1708,7 @@ def fn(a, mask, idx, values): ( torch.randn(8, device=self.device), torch.tensor([True, False, True], device=self.device), - [torch.tensor([3, 9, -2], device=self.device)], + [torch.tensor([3, 9, 2], device=self.device)], torch.randn(3, device=self.device), ), ) diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index aace06bdeba1b..6062cbcbb9fd5 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -3162,6 +3162,7 @@ def index_output_size_and_inner_fn( indexed_size, x_loader, check, + wrap_neg=True, ): # Note that behavior of indexing differs when there are non consecutive # tensors. In this case, the tensor index is pulled to the beginning. @@ -3213,6 +3214,7 @@ def fn(idx): loader(idx[start_offset : start_offset + rank]), size, check=check, + wrap_neg=wrap_neg, ) ) new_index = [ @@ -3235,7 +3237,7 @@ def index_impl(x, indices, check): ) -def index_impl_helper(x, indices, check): +def index_impl_helper(x, indices, check, wrap_neg=True): assert isinstance(indices, (list, tuple)) x_loader = x.make_loader() indices, tensor_indices = check_and_broadcast_indices(indices, x.get_device()) @@ -3263,6 +3265,7 @@ def index_impl_helper(x, indices, check): indexed_size, None, check=check, + wrap_neg=wrap_neg, ) def inner_fn(idx): @@ -3442,7 +3445,9 @@ def index_put_impl_(self, indices, values, accumulate, check): @register_lowering(aten._unsafe_masked_index, type_promotion_kind=None) def _unsafe_masked_index(self, mask, indices, fill): - ranges, _, _unsafe_index_fn = index_impl_helper(self, indices, check=False) + ranges, _, _unsafe_index_fn = index_impl_helper( + self, indices, check=False, wrap_neg=False + ) mask_loader = mask.make_loader() self_loader = self.make_loader() From 7e62ac51a125be92054f0961e5ac91e9c01288c1 Mon Sep 17 00:00:00 2001 From: Sam Larsen Date: Wed, 23 Oct 2024 22:55:00 +0000 Subject: [PATCH 020/161] [pt2] [testing] Skip inductor_freezing - test_cpp_wrapper_cuda internally (#138366) Summary: It's been failing CI since probably forever; skip for now Pull Request resolved: https://github.com/pytorch/pytorch/pull/138366 Approved by: https://github.com/eellison --- test/inductor/test_inductor_freezing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_inductor_freezing.py b/test/inductor/test_inductor_freezing.py index aa967bac37c4c..c3c2c95d99aed 100644 --- a/test/inductor/test_inductor_freezing.py +++ b/test/inductor/test_inductor_freezing.py @@ -16,7 +16,7 @@ from torch._inductor.utils import override_lowering, run_and_get_code from torch.testing import FileCheck from torch.testing._internal.common_cuda import SM80OrLater -from torch.testing._internal.common_utils import skipIfRocm, skipIfXpu +from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu # Make the helper files in test/ importable @@ -747,6 +747,7 @@ def foo(mod, inp): self.assertEqual(foo(mod, x), mod_eager) @skipIfXpu + @unittest.skipIf(IS_FBCODE, "Not yet runnable in fbcode") def test_cpp_wrapper(self): mod = ConvBN(3, 32, kernel_size=3, 
stride=2).eval().to(self.device) From 8f62832189f82bc904302a172809436bc28a54b4 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 24 Oct 2024 15:03:32 +0000 Subject: [PATCH 021/161] c10::nullopt -> std::nullopt (#138701) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138701 Approved by: https://github.com/Skylion007, https://github.com/malfet --- .ci/pytorch/test.sh | 2 +- aten/src/ATen/SavedTensorHooks.cpp | 2 +- c10/core/StorageImpl.h | 2 +- torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp | 4 ++-- torch/csrc/jit/serialization/pickle.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index ad94aac8d4879..61a6dbef015c8 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -81,7 +81,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then # # int main(int argv) { # Tensor b = empty({3, 4}); - # auto z = call(b, b.sym_sizes(), b.sym_strides(), c10::nullopt); + # auto z = call(b, b.sym_sizes(), b.sym_strides(), std::nullopt); # } export VALGRIND=OFF fi diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index b5733305ad069..871d9df0c924c 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -74,7 +74,7 @@ std::pair SavedTensorDefaultHooks::pop_hooks() { std::optional> SavedTensorDefaultHooks::get_hooks() { // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime] if (!is_initialized || tls.stack.empty() || tls.is_tracing) { - return c10::nullopt; + return std::nullopt; } return tls.stack.top(); } diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index 1140ed97f93de..257ddea005125 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -23,7 +23,7 @@ C10_API void warnDeprecatedDataPtr(); // Currently used only for storing a custom error message // used when throwing an exception when data_ptr is accessed. 
struct C10_API StorageExtraMeta { - std::optional custom_data_ptr_error_msg_ = c10::nullopt; + std::optional custom_data_ptr_error_msg_ = std::nullopt; StorageExtraMeta() = default; StorageExtraMeta(const StorageExtraMeta& other) { if (other.custom_data_ptr_error_msg_) { diff --git a/torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp b/torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp index c65acf66c92bf..62365e676d63a 100644 --- a/torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp +++ b/torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp @@ -192,7 +192,7 @@ void OSSProxyExecutor::prefill_stack_with_static_arguments( schema_arg_type->castRaw()->getElementType(); if (serialized_arg_type == "as_none") { - stack.emplace_back(c10::nullopt); + stack.emplace_back(std::nullopt); if (inner_type->kind() == c10::TypeKind::TensorType) { // Tensor is None dynamic_args.emplace_back(index, DynamicArgType::TensorType, 0); @@ -414,7 +414,7 @@ void OSSProxyExecutor::call_function( flatten_tensor_args[tensor_id++]); optional_tensor_list.emplace_back(*tensor); } else if (item_type == "as_none") { - optional_tensor_list.emplace_back(c10::nullopt); + optional_tensor_list.emplace_back(std::nullopt); } } stack[arg_index] = optional_tensor_list; diff --git a/torch/csrc/jit/serialization/pickle.cpp b/torch/csrc/jit/serialization/pickle.cpp index f2791d14304b3..4bf6189a5bf59 100644 --- a/torch/csrc/jit/serialization/pickle.cpp +++ b/torch/csrc/jit/serialization/pickle.cpp @@ -155,7 +155,7 @@ c10::IValue pickle_load_obj(std::string_view data) { /*tensor_prefix=*/"", /*type_resolver=*/customClassResolver, /*obj_loader=*/torch::jit::ObjLoaderFunc, - /*device=*/c10::nullopt, + /*device=*/std::nullopt, reader); #else TORCH_CHECK( From f7bb11dcc2de3027834c3b49857ff42f672ce164 Mon Sep 17 00:00:00 2001 From: eqy Date: Thu, 24 Oct 2024 15:33:58 +0000 Subject: [PATCH 022/161] [cuDNN][cuDNN Frontend] Check in test for previously broken dBias check (#138725) see https://github.com/pytorch/pytorch/issues/137347, let's try to land before https://github.com/pytorch/pytorch/pull/138709 CC @malfet @drisspg @Skylion007 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138725 Approved by: https://github.com/Skylion007, https://github.com/drisspg --- test/test_transformers.py | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/test/test_transformers.py b/test/test_transformers.py index 0329d86fafb23..737f830fef329 100644 --- a/test/test_transformers.py +++ b/test/test_transformers.py @@ -2495,6 +2495,46 @@ def test_cudnn_attention_trivial_output_transpose(self, device): o.backward(o) torch.testing.assert_close(x.grad, x_cpu.grad.cuda(), atol=7e-3, rtol=7e-3) + @skipIfRocm # No cuDNN Attention + @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cudnn Attention is not supported on this system") + def test_cudnn_attention_nonmodulo64seqlen(self, device): + # see also: https://github.com/pytorch/pytorch/issues/137347 + mask = torch.randint(0, 2, (2, 1, 157, 6404)).to(device="cuda", dtype=torch.bool) + q = torch.randn(2, 32, 157, 128, device='cuda', dtype=torch.bfloat16, requires_grad=True) + k = torch.randn(2, 32, 6404, 128, device='cuda', dtype=torch.bfloat16, requires_grad=True) + v = torch.randn(2, 32, 6404, 128, device='cuda', dtype=torch.bfloat16, requires_grad=True) + q_cpu = q.detach().clone().cpu() + k_cpu = k.detach().clone().cpu() + v_cpu = v.detach().clone().cpu() + q_cpu.requires_grad = True + k_cpu.requires_grad = True + v_cpu.requires_grad = True + 
mask_cpu = mask.detach().clone().cpu() + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION): + out = nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=mask, + dropout_p=0.0, + is_causal=False, + ) + out_cpu = nn.functional.scaled_dot_product_attention( + q_cpu, + k_cpu, + v_cpu, + attn_mask=mask_cpu, + dropout_p=0.0, + is_causal=False, + ) + + out.sum().backward() + out_cpu.sum().backward() + + torch.testing.assert_close(q.grad, q_cpu.grad.cuda(), atol=3e-3, rtol=2e-3) + torch.testing.assert_close(k.grad, k_cpu.grad.cuda(), atol=3e-3, rtol=2e-3) + torch.testing.assert_close(v.grad, v_cpu.grad.cuda(), atol=3e-3, rtol=2e-3) + @unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system") @parametrize("mask_dim", [1, 2, 3, 4]) def test_mem_efficient_attention_mask_variants(self, device, mask_dim: List[int]): From b021486405de45e184b34c4eeeba7c3b6cf2da73 Mon Sep 17 00:00:00 2001 From: Irem Yuksel <113098562+iremyux@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:10:44 +0000 Subject: [PATCH 023/161] Enable Windows Arm64 (#133088) This PR enables Pytorch for Windows on Arm64 - CPU only. Currently, there aren't any checks in place to build and test for Windows on Arm64, but we're working to implement those as soon as possible. We recommend using [Arm Performance Libraries (APL)](https://developer.arm.com/Tools%20and%20Software/Arm%20Performance%20Libraries) as a BLAS option, which is introduced in this PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/133088 Approved by: https://github.com/malfet Co-authored-by: cristian panaite Co-authored-by: Stefan-Alin Pahontu <56953855+alinpahontu2912@users.noreply.github.com> Co-authored-by: Ozan Aydin <148207261+ozanMSFT@users.noreply.github.com> --- aten/src/ATen/CMakeLists.txt | 2 +- aten/src/ATen/native/BatchLinearAlgebra.cpp | 65 ++++++++++++++++++++- caffe2/CMakeLists.txt | 5 +- cmake/Dependencies.cmake | 10 +++- cmake/Modules/FindAPL.cmake | 58 ++++++++++++++++++ cmake/Modules/FindLAPACK.cmake | 28 +++++++++ torch/_inductor/cpp_builder.py | 2 +- torch/utils/cpp_extension.py | 3 +- 8 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 cmake/Modules/FindAPL.cmake diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index a9dfe05f852e1..bcfaff434c1bd 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -428,7 +428,7 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) endif() -if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) +if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE AND NOT (MSVC AND CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")) if(NOT MSVC) # Bump up optimization level for sleef to -O1, since at -O0 the compiler # excessively spills intermediate vector registers to the stack diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 8b8cf4090ba8f..1df22fb451f6e 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -132,11 +132,46 @@ extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *inf extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); // potrs +#if defined(_WIN32) && defined(_M_ARM64) + +// The functions zpotrs, cpotrs, dpotrs, and spotrs are not directly available in LAPACKE on Windows on ARM, +// so we need to have wrapper functions to call them. 
+// The issue on ARM platform can be found below: +// https://community.arm.com/support-forums/f/high-performance-computing-forum/56512/unable-to-use-lapack---potrs-functions + +#define LAPACK_COL_MAJOR 102 +#define LAPACK_ROW_MAJOR 101 + +extern "C" int LAPACKE_zpotrs(int matrix_layout, char uplo, int n, int nrhs, const std::complex *a, int lda, std::complex *b, int ldb); +extern "C" int LAPACKE_cpotrs(int matrix_layout, char uplo, int n, int nrhs, const std::complex *a, int lda, std::complex *b, int ldb); +extern "C" int LAPACKE_dpotrs(int matrix_layout, char uplo, int n, int nrhs, const double *a, int lda, double *b, int ldb); +extern "C" int LAPACKE_spotrs(int matrix_layout, char uplo, int n, int nrhs, const float *a, int lda, float *b, int ldb); + +static inline void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info) { + *info = LAPACKE_zpotrs(LAPACK_COL_MAJOR, *uplo, *n, *nrhs, a, *lda, b, *ldb); +} + +static inline void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info) { + *info = LAPACKE_cpotrs(LAPACK_COL_MAJOR, *uplo, *n, *nrhs, a, *lda, b, *ldb); +} + +static inline void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info){ + *info = LAPACKE_dpotrs(LAPACK_COL_MAJOR, *uplo, *n, *nrhs, a, *lda, b, *ldb); +} + +static inline void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info) { + *info = LAPACKE_spotrs(LAPACK_COL_MAJOR, *uplo, *n, *nrhs, a, *lda, b, *ldb); +} + +#else + extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, int *lda, std::complex *b, int *ldb, int *info); extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); +#endif + // potrf extern "C" void zpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); extern "C" void cpotrf_(char *uplo, int *n, std::complex *a, int *lda, int *info); @@ -284,11 +319,39 @@ extern "C" void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau extern "C" void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); // ormqr +#if defined(_WIN32) && defined(_M_ARM64) + +// The functions zunmqr, cunmqr, dormqr, and sormqr are not directly available in LAPACKE on Windows on ARM, +// so we need to have wrapper functions to call them. 
+// The issue on ARM platform can be found below: +// https://community.arm.com/support-forums/f/high-performance-computing-forum/56512/unable-to-use-lapack---potrs-functions + +extern "C" int LAPACKE_zunmqr_work(int matrix_layout, char side, char trans, int m, int n, int k, const std::complex *a, int lda, const std::complex *tau, std::complex *c, int ldc, std::complex *work, int lwork); +extern "C" int LAPACKE_cunmqr_work(int matrix_layout, char side, char trans, int m, int n, int k, const std::complex *a, int lda, const std::complex *tau, std::complex *c, int ldc, std::complex *work, int lwork); +extern "C" int LAPACKE_dormqr_work(int matrix_layout, char side, char trans, int m, int n, int k, const double *a, int lda, const double *tau, double *c, int ldc, double *work, int lwork); +extern "C" int LAPACKE_sormqr_work(int matrix_layout, char side, char trans, int m, int n, int k, const float *a, int lda, const float *tau, float *c, int ldc, float *work, int lwork); + +static inline void zunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info) { + *info = LAPACKE_zunmqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); +} + +static inline void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info) { + *info = LAPACKE_cunmqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); +} + +static inline void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info) { + *info = LAPACKE_dormqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); +} + +static inline void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info) { + *info = LAPACKE_sormqr_work(LAPACK_COL_MAJOR, *side, *trans, *m, *n, *k, a, *lda, tau, c, *ldc, work, *lwork); +} +#else extern "C" void zunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void cunmqr_(char *side, char *trans, int *m, int *n, int *k, std::complex *a, int *lda, std::complex *tau, std::complex *c, int *ldc, std::complex *work, int *lwork, int *info); extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info); - +#endif // syevd extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, double *w, std::complex *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info); extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, float *w, std::complex *work, int *lwork, float *rwork, int *lrwork, int *iwork, int *liwork, int *info); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index f9eb6fe2b3832..32ad2037febdc 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1719,7 +1719,10 @@ if(BUILD_TEST) endif() else() add_executable(${test_name}_${CPU_CAPABILITY} 
"${test_src}") - target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library sleef gtest_main) + target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) + if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") + target_link_libraries(${test_name}_${CPU_CAPABILITY} sleef) + endif() endif() target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE $) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6afb8720d090f..19667b73287ca 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -161,7 +161,7 @@ else() set(AT_MKLDNN_ENABLED 0) set(AT_MKL_ENABLED 0) endif() -set_property(CACHE BLAS PROPERTY STRINGS "ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib") +set_property(CACHE BLAS PROPERTY STRINGS "ATLAS;BLIS;Eigen;FLAME;Generic;MKL;OpenBLAS;vecLib;APL") message(STATUS "Trying to find preferred BLAS backend of choice: " ${BLAS}) if(BLAS STREQUAL "Eigen") @@ -226,6 +226,12 @@ elseif(BLAS STREQUAL "FlexiBLAS") find_package(FlexiBLAS REQUIRED) include_directories(SYSTEM ${FlexiBLAS_INCLUDE_DIR}) list(APPEND Caffe2_DEPENDENCY_LIBS ${FlexiBLAS_LIB}) +elseif(BLAS STREQUAL "APL") + find_package(APL REQUIRED) + include_directories(SYSTEM ${APL_INCLUDE_DIR}) + set(BLAS_INFO "apl") + set(BLAS_FOUND 1) + set(BLAS_LIBRARIES ${APL_LIBRARIES}) elseif(BLAS STREQUAL "Generic") # On Debian family, the CBLAS ABIs have been merged into libblas.so if(ENV{GENERIC_BLAS_LIBRARIES} STREQUAL "") @@ -246,7 +252,7 @@ endif() if(NOT INTERN_BUILD_MOBILE) set(AT_MKL_SEQUENTIAL 0) set(USE_BLAS 1) - if(NOT (ATLAS_FOUND OR BLIS_FOUND OR GENERIC_BLAS_FOUND OR MKL_FOUND OR OpenBLAS_FOUND OR VECLIB_FOUND OR FlexiBLAS_FOUND OR NVPL_BLAS_FOUND)) + if(NOT (ATLAS_FOUND OR BLIS_FOUND OR GENERIC_BLAS_FOUND OR MKL_FOUND OR OpenBLAS_FOUND OR VECLIB_FOUND OR FlexiBLAS_FOUND OR NVPL_BLAS_FOUND OR APL_FOUND)) message(WARNING "Preferred BLAS (" ${BLAS} ") cannot be found, now searching for a general BLAS library") find_package(BLAS) if(NOT BLAS_FOUND) diff --git a/cmake/Modules/FindAPL.cmake b/cmake/Modules/FindAPL.cmake new file mode 100644 index 0000000000000..7b97283b67f1f --- /dev/null +++ b/cmake/Modules/FindAPL.cmake @@ -0,0 +1,58 @@ +# - Find APL (Arm Performance Libraries) +# +# This module sets the following variables: +# APL_INCLUDE_SEARCH_PATHS - list of paths to search for APL include files +# APL_LIB_SEARCH_PATHS - list of paths to search for APL libraries +# APL_FOUND - set to true if APL is found +# APL_INCLUDE_DIR - path to include dir. +# APL_LIB_DIR - path to include dir. +# APL_LIBRARIES - list of libraries for base APL + +SET(APL_INCLUDE_SEARCH_PATHS $ENV{ARMPL_DIR}/include) +SET(APL_LIB_SEARCH_PATHS $ENV{ARMPL_DIR}/lib) + +SET(APL_FOUND ON) + +# Check include file +FIND_PATH(APL_INCLUDE_DIR NAMES armpl.h PATHS ${APL_INCLUDE_SEARCH_PATHS}) +IF(NOT APL_INCLUDE_DIR) + SET(APL_FOUND OFF) + MESSAGE(STATUS "Could not verify APL include directory. Turning APL_FOUND off") +ENDIF() + +# Check lib file +FIND_PATH(APL_LIB_DIR NAMES libarmpl_lp64_mp.dll.lib libomp.dll.lib libarmpl_lp64_mp.a PATHS ${APL_LIB_SEARCH_PATHS}) +IF(NOT APL_LIB_DIR) + SET(APL_FOUND OFF) + MESSAGE(STATUS "Could not verify APL lib directory. 
Turning APL_FOUND off") +ENDIF() + +IF (APL_FOUND) + IF(WIN32) + set(APL_LIBRARIES + "${APL_LIB_DIR}/libarmpl_lp64_mp.dll.lib" + "${APL_LIB_DIR}/libomp.dll.lib" + ) + ELSEIF(UNIX) + set(APL_LIBRARIES + "${APL_LIB_DIR}/libarmpl_lp64_mp.a" + ) + ENDIF() + MESSAGE(STATUS "Found APL header: ${APL_INCLUDE_DIR}") + MESSAGE(STATUS "Found APL library: ${APL_LIB_DIR}") + message(STATUS "APL_LIBRARIES: ${APL_LIBRARIES}") + SET(CMAKE_REQUIRED_LIBRARIES ${APL_LIBRARIES}) + include(CheckCSourceRuns) + CHECK_C_SOURCE_RUNS(" +#include +#include +float x[4] = { 1, 2, 3, 4 }; +float y[4] = { .1, .01, .001, .0001 }; +extern float cblas_sdot(); +int main() { + int i; + double r = cblas_sdot(4, x, 1, y, 1); + exit((float)r != (float).1234); +}" BLAS_USE_CBLAS_DOT ) + MESSAGE(STATUS "BLAS_USE_CBLAS_DOT: ${BLAS_USE_CBLAS_DOT}") +ENDIF (APL_FOUND) \ No newline at end of file diff --git a/cmake/Modules/FindLAPACK.cmake b/cmake/Modules/FindLAPACK.cmake index dbe47d6cdcf19..7d343f8adab7f 100644 --- a/cmake/Modules/FindLAPACK.cmake +++ b/cmake/Modules/FindLAPACK.cmake @@ -223,6 +223,34 @@ if(BLAS_FOUND) endif(LAPACK_LIBRARIES) endif() + #Arm Performance Libraries + IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "apl")) + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + check_function_exists("cheev_" APL_LAPACK_WORKS) + if(APL_LAPACK_WORKS) + check_function_exists("cgesdd_" LAPACK_CGESDD_WORKS) + if(NOT LAPACK_CGESDD_WORKS) + find_library(GFORTRAN_LIBRARY + NAMES libgfortran.a gfortran + PATHS ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}) + list(APPEND CMAKE_REQUIRED_LIBRARIES "${GFORTRAN_LIBRARY}") + unset(LAPACK_CGESDD_WORKS CACHE) + check_function_exists("cgesdd_" LAPACK_CGESDD_WORKS) + if(LAPACK_CGESDD_WORKS) + list(APPEND LAPACK_LIBRARIES "${GFORTRAN_LIBRARY}") + else() + message(WARNING "APL has been compiled with Lapack support, but cgesdd can not be used") + set(APL_LAPACK_WORKS NO) + endif() + endif() + endif() + set(CMAKE_REQUIRED_LIBRARIES) + if(APL_LAPACK_WORKS) + SET(LAPACK_INFO "apl") + else() + message(STATUS "It seems APL has not been compiled with Lapack support") + endif() + endif() else(BLAS_FOUND) message(STATUS "LAPACK requires BLAS") endif(BLAS_FOUND) diff --git a/torch/_inductor/cpp_builder.py b/torch/_inductor/cpp_builder.py index 06fc9a46e91e1..866be87904515 100644 --- a/torch/_inductor/cpp_builder.py +++ b/torch/_inductor/cpp_builder.py @@ -779,7 +779,7 @@ def _get_torch_related_args( if not aot_mode: libraries.append("torch_python") - if _IS_WINDOWS: + if _IS_WINDOWS and platform.machine().lower() != "arm64": libraries.append("sleef") return include_dirs, libraries_dirs, libraries diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 6e0317ef14f39..e1e260bd6448d 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -4,6 +4,7 @@ import importlib import importlib.abc import os +import platform import re import shlex import shutil @@ -994,7 +995,7 @@ def CppExtension(name, sources, *args, **kwargs): libraries.append('torch') libraries.append('torch_cpu') libraries.append('torch_python') - if IS_WINDOWS: + if IS_WINDOWS and platform.machine().lower() != "arm64": libraries.append("sleef") kwargs['libraries'] = libraries From fe458eef8032f993b155d836538c60b5beddc569 Mon Sep 17 00:00:00 2001 From: Shuqiang Zhang Date: Wed, 23 Oct 2024 18:34:15 -0700 Subject: [PATCH 024/161] [c10d] fix a logic of using ncclCommSplit (#138781) Summary: Currently, whether split should be used depends on the size of subgroup. 
It's possible that default PG is not eagerly initialized yet, but split is still called. This PR fixes this issue by removing split's dependency on subgroup size Test Plan: Modified UT Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/138781 Approved by: https://github.com/kwen2501 --- test/distributed/test_c10d_nccl.py | 17 +++++++++-------- torch/distributed/distributed_c10d.py | 11 +++-------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 84452ac08a625..2626d694dbc11 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -626,8 +626,9 @@ def test_abort_in_destroy_multi_pgs(self): new_pg1.allreduce(t1).wait() new_pg2.allreduce(t2).wait() backend = pg._get_backend(torch.device(device)) - # default PG's backend should have a split count of 2 - self.assertEqual(backend.comm_split_count(), 2) + # default PG's backend should have a split count of 0 because + # it's not eager initialized + self.assertEqual(backend.comm_split_count(), 0) # shutdown all NCCL PGs in one shot dist.destroy_process_group() @@ -649,8 +650,8 @@ def test_abort_in_destroy_mixed_empty_pgs(self): new_pg2.allreduce(t2).wait() backend = pg._get_backend(torch.device(device)) - # default PG's backend should have a split count of 1 - self.assertEqual(backend.comm_split_count(), 1) + # default PG's backend should have a split count of 0 + self.assertEqual(backend.comm_split_count(), 0) # shutdown all NCCL PGs in one shot dist.destroy_process_group() @@ -802,7 +803,7 @@ def test_extend_nccl_pg_timeout(self, backend): @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - def test_comm_split_optimization(self): + def test_comm_lazy_init_split(self): # Test the optimization of new groups that contain all world # ranks use the "transparent" `ncclCommSplit` optimization. store = c10d.FileStore(self.file_name, self.world_size) @@ -820,9 +821,9 @@ def test_comm_split_optimization(self): pg.broadcast(tensor, 0) self.assertEqual(backend.comm_split_count(), 0) - # The new group will force a split of the original on first use. + # The new group will not force a split because it is a lazy init. ng.broadcast(tensor, 0) - self.assertEqual(backend.comm_split_count(), 1) + self.assertEqual(backend.comm_split_count(), 0) @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @@ -913,7 +914,7 @@ def test_non_blocking_init(self): self.assertEqual(backend.comm_split_count(), 0) broadcast_tensor = torch.tensor([self.rank]).cuda(device) new_pg.broadcast(broadcast_tensor, 0).wait() - self.assertEqual(backend.comm_split_count(), 1) + self.assertEqual(backend.comm_split_count(), 0) dist.destroy_process_group() @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index db2285e958342..9942d6ddec485 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1776,14 +1776,9 @@ def _new_process_group_helper( # communicators based on pre-existing ones, which can save # initialization time. 
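In user code, eager initialization of the default group is requested by binding a device id at init time, which is the condition the comment continues with below. A minimal sketch of that usage, assuming a torchrun-style launcher sets LOCAL_RANK and the usual rendezvous environment variables, and that at least two CUDA devices are present:

import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # provided by torchrun in this sketch
# Binding a device id makes init_process_group create the NCCL communicator
# eagerly, which is what allows later subgroups to be carved out via
# ncclCommSplit instead of a fresh, full initialization.
dist.init_process_group("nccl", device_id=torch.device("cuda", local_rank))
subgroup = dist.new_group(ranks=[0, 1])  # candidate for the split path
dist.destroy_process_group()
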
Due to lazy initialization of # communicators in some backends, we have to be careful and only - # split when we *know* the backends already are connected _on all - # ranks_. We can only know this if the group we are making is the - # entire world or if we have bound a device id to the world (which - # causes early connection initialization). - if is_initialized() and ( - len(global_ranks_in_group) == _get_default_group().size() - or _get_default_group().bound_device_id - ): + # split when we *know* the default PG has already started communicator initialization. + # We know this if we have bound a device id to the default pg (eager initialized). + if is_initialized() and _get_default_group().bound_device_id: split_from = _get_split_source(_get_default_group()) else: split_from = None From 5ea67778619c31b13644914deef709199052ee55 Mon Sep 17 00:00:00 2001 From: IvanKobzarev Date: Wed, 23 Oct 2024 05:28:34 -0700 Subject: [PATCH 025/161] [subclass] Unwrap_tensor_subclasses micro optimization (#138498) unwrap_tensor_subclasses -> get_plain_tensors Is used at runtime. For small models this overhead is feasible in comparison with small compiled kernel. 1/ Removing asserts from runtime path 2/ Removing list creation with using optional output list to append argument Pull Request resolved: https://github.com/pytorch/pytorch/pull/138498 Approved by: https://github.com/bdhirsh --- torch/_functorch/_aot_autograd/subclass_utils.py | 7 ++----- torch/_subclasses/fake_tensor.py | 9 +++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/torch/_functorch/_aot_autograd/subclass_utils.py b/torch/_functorch/_aot_autograd/subclass_utils.py index ad8de0eac069f..62b6223440a33 100644 --- a/torch/_functorch/_aot_autograd/subclass_utils.py +++ b/torch/_functorch/_aot_autograd/subclass_utils.py @@ -113,12 +113,9 @@ def create_subclass_meta( # NOTE: this function is hot, since we unwrap tensor subclass inputs at runtime def unwrap_tensor_subclasses(wrapped_args, *, is_joint_structure: bool): def concat_inner_tensors_from_subclasses(xs): - xs_inner = [] + xs_inner: List[Tensor] = [] for x in xs: - if is_traceable_wrapper_subclass(x): - xs_inner.extend(get_plain_tensors(typing.cast(Tensor, x))) - else: - xs_inner.append(x) + get_plain_tensors(x, out_append_list=xs_inner) return xs_inner if is_joint_structure: diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index f59e6242c982f..df2fef74e127e 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -151,14 +151,15 @@ def unset_fake_temporarily() -> Generator[Optional[TorchDispatchMode], None, Non torch._C._set_dispatch_mode(old) -def get_plain_tensors(subclass: Tensor) -> List[Tensor]: - assert is_traceable_wrapper_subclass(subclass) - plain_tensors: List[Tensor] = [] +def get_plain_tensors( + subclass: Tensor, out_append_list: Optional[List[Tensor]] = None +) -> List[Tensor]: + # This function is used in Runtime, do not add redundant asserts + plain_tensors: List[Tensor] = [] if out_append_list is None else out_append_list todo = [subclass] while todo: curr = todo.pop() if not is_traceable_wrapper_subclass(curr): - assert isinstance(curr, Tensor) plain_tensors.append(curr) continue From 8197e4c70dcbfed01e2b0439b09f83ed0cbd9666 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 24 Oct 2024 17:27:06 +0000 Subject: [PATCH 026/161] Revert "[sparse] add search for optimal alg_id to torch.compile (#137427)" This reverts commit 39bfba3f561e3125ce035de0bf90c8c7bcccd3ce. 
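The get_plain_tensors change in #138498 above illustrates a common hot-path idiom: accept an optional output list from the caller so repeated calls append into one buffer instead of allocating and merging per-call temporaries. A small self-contained sketch of the idiom, with invented names and no dependence on the real subclass machinery:

from typing import Any, List, Optional

def collect_leaves(node: Any, out: Optional[List[Any]] = None) -> List[Any]:
    # Reuse the caller's list when one is supplied; allocate only at the top level.
    leaves: List[Any] = [] if out is None else out
    stack = [node]
    while stack:
        current = stack.pop()
        if isinstance(current, (list, tuple)):
            stack.extend(reversed(current))  # keep left-to-right order
        else:
            leaves.append(current)
    return leaves

# The caller flattens many inputs into a single list without intermediate lists.
flat: List[Any] = []
for item in ([1, [2, 3]], (4, 5)):
    collect_leaves(item, out=flat)
print(flat)  # [1, 2, 3, 4, 5]
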
Reverted https://github.com/pytorch/pytorch/pull/137427 on behalf of https://github.com/jcaip due to this PR breaks AO tests ([comment](https://github.com/pytorch/pytorch/pull/137427#issuecomment-2435906592)) --- aten/src/ATen/native/native_functions.yaml | 2 +- .../ATen/native/sparse/cuda/cuSPARSELtOps.cpp | 72 ++--- .../ATen/native/sparse/cuda/cuSPARSELtOps.h | 58 ---- .../benchmark_semi_structured_sparsity.py | 253 ++++++++++++++++++ test/test_sparse_semi_structured.py | 24 +- torch/_inductor/kernel/mm.py | 126 +-------- torch/_meta_registrations.py | 30 +-- torch/backends/cusparselt/__init__.py | 13 +- torch/csrc/cuda/shared/cusparselt.cpp | 31 +-- torch/sparse/_semi_structured_ops.py | 48 +--- 10 files changed, 314 insertions(+), 343 deletions(-) delete mode 100644 aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.h create mode 100644 benchmarks/sparse/benchmark_semi_structured_sparsity.py diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 01e090b2d370f..3625cd8712496 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3357,7 +3357,7 @@ dispatch: CUDA: _cslt_compress -- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, bool split_k_one_kernel=True) -> Tensor +- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor dispatch: CUDA: _cslt_sparse_mm diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp index 8fb56ec40a755..ca3996f00e7a0 100644 --- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp +++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp @@ -1,7 +1,20 @@ -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #if AT_CUSPARSELT_ENABLED() +#include + namespace at::native { // Ideally we would use the same DeviceThreadHandlePool mechanism as used in aten/src/ATen/cuda/CuSparseHandlePool.cpp @@ -43,7 +56,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 case at::ScalarType::Float8_e4m3fn: type = CUDA_R_8F_E4M3; - compression_factor = 10; break; #endif default: @@ -91,7 +103,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) return compressed_tensor; } -std::tuple _cslt_sparse_mm_impl( +std::tuple _cslt_sparse_mm_impl( const Tensor& compressed_A, const Tensor& dense_B, const std::optional& bias_opt, @@ -99,8 +111,6 @@ std::tuple _cslt_sparse_mm_impl( const std::optional out_dtype_opt, bool transpose_result, int alg_id, - int split_k, - bool split_k_one_kernel, bool search_alg_id ) { @@ -159,7 +169,6 @@ std::tuple _cslt_sparse_mm_impl( output_type = CUDA_R_8F_E4M3; C_type = CUDA_R_16F; compute_type = CUSPARSE_COMPUTE_32F; - compression_factor = 10; break; #endif // cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F @@ -326,21 +335,10 @@ std::tuple _cslt_sparse_mm_impl( TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit( &handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); - // set matmul search params + // set alg_id TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute( &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg_id, sizeof(alg_id))); - 
cusparseLtSplitKMode_t splitKMode; - int max_alg_id; - if (split_k != 1) { - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_SPLIT_K, &split_k, sizeof(split_k))); - - splitKMode = split_k_one_kernel ? CUSPARSELT_SPLIT_K_MODE_ONE_KERNEL : CUSPARSELT_SPLIT_K_MODE_TWO_KERNELS; - TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_SPLIT_K_MODE, &splitKMode, sizeof(splitKMode))); - } - // set tensor_alpha_mode and alpha pointer for matmul const auto alpha_tensor = alpha_opt.has_value() ? *alpha_opt: Tensor{}; auto alpha_ptr = α @@ -383,23 +381,9 @@ std::tuple _cslt_sparse_mm_impl( &stream, 1)); - // get matmul params used + // get alg_id used TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgGetAttribute( &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg_id, sizeof(alg_id))); - - TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel, - CUSPARSELT_MATMUL_SPLIT_K, - &split_k, sizeof(split_k))); - - TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel, - CUSPARSELT_MATMUL_SPLIT_K_MODE, - &splitKMode, sizeof(splitKMode))); - - TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel, - CUSPARSELT_MATMUL_ALG_CONFIG_MAX_ID, - &max_alg_id, sizeof(max_alg_id))); - - } else { // do normal matmul @@ -427,7 +411,7 @@ std::tuple _cslt_sparse_mm_impl( // destroy plan TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan)); - return {res, alg_id, split_k, splitKMode == CUSPARSELT_SPLIT_K_MODE_ONE_KERNEL, max_alg_id}; + return {alg_id, res}; } at::Tensor _cslt_sparse_mm( @@ -437,9 +421,7 @@ at::Tensor _cslt_sparse_mm( const std::optional& alpha_opt, const std::optional out_dtype_opt, bool transpose_result, - int64_t alg_id, - int64_t split_k, - bool split_k_one_kernel + int64_t alg_id ) { auto result = _cslt_sparse_mm_impl( @@ -450,10 +432,8 @@ at::Tensor _cslt_sparse_mm( out_dtype_opt, transpose_result, (int) alg_id, - (int) split_k, - split_k_one_kernel, false); - return std::get<0>(result); + return std::get<1>(result); } int64_t _cslt_sparse_mm_search( @@ -465,10 +445,7 @@ int64_t _cslt_sparse_mm_search( bool transpose_result ) { - TORCH_WARN_ONCE("torch._cslt_sparse_mm_search is deprecated and will be removed in a future PyTorch release. Please use torch._C._cusparselt.mm_search instead."); int alg_id_int = 0; - int split_k = 1; - bool split_k_one_kernel= true; auto result = _cslt_sparse_mm_impl( compressed_A, dense_B, @@ -477,12 +454,11 @@ int64_t _cslt_sparse_mm_search( out_dtype_opt, transpose_result, alg_id_int, - split_k, - split_k_one_kernel, true); - return (int64_t) std::get<1>(result); + return (int64_t) std::get<0>(result); } + } // namespace at::native #else // No cuSPARSELt support, throw error if these functions are called. 
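At the Python level, the private ops touched in this file are exercised roughly as in the tests restored later in this revert (see test_cslt_sparse_mm_alg_id): compress the 2:4-sparse operand once, optionally run the one-off algorithm search, then reuse the returned id for subsequent matmuls. A minimal sketch, assuming a CUDA build with cuSPARSELt support:

import torch

A = torch.randn(256, 128, dtype=torch.float16, device="cuda")
# Zero two elements in every contiguous group of four so A is valid 2:4 sparse.
A = A.reshape(-1, 4)
A[:, ::2] = 0
A = A.reshape(256, 128)
B = torch.randn(128, 128, dtype=torch.float16, device="cuda")

A_compressed = torch._cslt_compress(A)                      # pack the sparse operand
alg_id = torch._cslt_sparse_mm_search(A_compressed, B.t())  # one-off autotune
out = torch._cslt_sparse_mm(A_compressed, B.t(), alg_id=alg_id)
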
@@ -500,9 +476,7 @@ at::Tensor _cslt_sparse_mm( const std::optional& alpha_opt, const std::optional out_dtype, bool transpose_result, - int64_t alg_id, - int64_t split_k, - bool split_k_one_kernel) + int64_t alg_id) { TORCH_CHECK(false, "cuSPARSELt not supported on your machine."); } diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.h b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.h deleted file mode 100644 index 00e7a8e1477d8..0000000000000 --- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if AT_CUSPARSELT_ENABLED() -#include -#endif - -namespace at::native { - -at::Tensor _cslt_compress(const Tensor& sparse_input); - -TORCH_CUDA_CPP_API std::tuple _cslt_sparse_mm_impl( - const Tensor& compressed_A, - const Tensor& dense_B, - const std::optional& bias_opt, - const std::optional& alpha_opt, - const std::optional out_dtype_opt, - bool transpose_result, - int alg_id, - int split_k, - bool split_k_one_kernel, - bool search_alg_id -); - -at::Tensor _cslt_sparse_mm( - const Tensor& compressed_A, - const Tensor& dense_B, - const std::optional& bias_opt, - const std::optional& alpha_opt, - const std::optional out_dtype_opt, - bool transpose_result, - int64_t alg_id, - int64_t split_k, - bool split_k_one_kernel -); - -int64_t _cslt_sparse_mm_search( - const Tensor& compressed_A, - const Tensor& dense_B, - const std::optional& bias_opt, - const std::optional& alpha_opt, - const std::optional out_dtype_opt, - bool transpose_result -); - -} // namespace at::native diff --git a/benchmarks/sparse/benchmark_semi_structured_sparsity.py b/benchmarks/sparse/benchmark_semi_structured_sparsity.py new file mode 100644 index 0000000000000..66311c40428fb --- /dev/null +++ b/benchmarks/sparse/benchmark_semi_structured_sparsity.py @@ -0,0 +1,253 @@ +import argparse +import random + +import pandas as pd +from tqdm import tqdm + +import torch +import torch.utils.benchmark as benchmark +from torch import nn +from torch.sparse import SparseSemiStructuredTensor, to_sparse_semi_structured + + +torch.set_printoptions( + precision=2, + threshold=None, + edgeitems=16, + linewidth=480, + profile=None, + sci_mode=False, +) + + +# helper model definition for pruner +class Model(nn.Module): + def __init__(self, m, k, dtype=None): + super().__init__() + # transposed so reversed + self.linear = nn.Linear(k, m) + + def forward(self, x): + return self.linear(x) + + +def rand_sparse_semi_structured_mask( + r, c, dtype=torch.float16, device="cuda", choice=None +): + """ + This function returns a 1:2 sparse matrix of size (r, c). + Note that this means this matrix will also be 2:4 and 4:8 sparse as well. 
+ """ + + choices = [[0, 1], [1, 0]] + mask_entries = [choice or random.choice(choices) for i in range(r * c // 2)] + + return ( + torch.tensor(mask_entries, dtype=dtype, device=device) + .reshape(r, c) + .contiguous() + ) + + +def test_linear(m, k, n, dtype, contiguous, backend): + SparseSemiStructuredTensor._FORCE_CUTLASS = backend == "cutlass" + mask = rand_sparse_semi_structured_mask(m, k, dtype=dtype) + sparse_weight = torch.rand(m, k).to(dtype).cuda() * mask + input_tensor = torch.zeros(n, k).to(dtype).cuda() + model = Model(m, k).to(dtype).cuda().eval() + + dense_measurement = benchmark.Timer( + stmt="model(input_tensor)", + globals=locals(), + ).blocked_autorange() + + dense_output = model(input_tensor) + print(dense_output.shape) + + # sparsify weights + model.linear.weight = nn.Parameter( + to_sparse_semi_structured( + sparse_weight, + ) + ) + + sparse_output = model(input_tensor) + print(sparse_output.shape) + + sparse_measurement = benchmark.Timer( + stmt="model(input_tensor)", + globals=locals(), + ).blocked_autorange() + + correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3) + + return { + "test_function": "linear", + "m": m, + "k": k, + "n": n, + "dtype": str(dtype), + "backend": backend, + "sparse_latency (ms)": sparse_measurement.median * 1000, + "dense_latency (ms)": dense_measurement.median * 1000, + "speedup (d/s)": dense_measurement.median / sparse_measurement.median, + "correct": correct, + "contiguous": sparse_output.is_contiguous(), + } + + +def test_tensor(m, k, n, dtype, contiguous, backend): + A = rand_sparse_semi_structured_mask(m, k, dtype=dtype) + B = torch.zeros(k, n).to(dtype).cuda() + bias = torch.rand(n).to(dtype).cuda() + + sA = to_sparse_semi_structured(A) + + # torch.mm calculation + if dtype is not torch.int8: + dense_output = torch.mm(A, B) + + dense_measurement = benchmark.Timer( + stmt="torch.mm(A, B)", + globals=locals(), + ).blocked_autorange() + + else: + print("int8 baseline not supported") + dense_output = torch.mm(sA, B) + + dense_measurement = benchmark.Timer( + stmt="torch.mm(sA, B)", + globals=locals(), + ).blocked_autorange() + + sparse_output = torch.mm(sA, B) + sparse_measurement = benchmark.Timer( + stmt="torch.mm(sA, B)", + globals=locals(), + ).blocked_autorange() + + correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3) + + return { + "test_function": "tensor", + "m": m, + "k": k, + "n": n, + "dtype": str(dtype), + "backend": backend, + "sparse_latency (ms)": sparse_measurement.median * 1000, + "dense_latency (ms)": dense_measurement.median * 1000, + "speedup (d/s)": dense_measurement.median / sparse_measurement.median, + "correct": correct, + "contiguous": sparse_output.is_contiguous(), + } + + +if __name__ == "__main__": + dtype_lookup = { + "int8": torch.int8, + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + + parser = argparse.ArgumentParser(description="Semi-Structured Sparsity Benchmarks") + parser.add_argument( + "--mode", + type=str, + choices=[ + "nvidia-bert", + "nvidia-fixed-k", + "nvidia-fixed-mn", + ], + ) + parser.add_argument( + "--dtype", + type=str, + choices=dtype_lookup.keys(), + default="fp16", + ) + parser.add_argument( + "--backend", type=str, choices=["cutlass", "cusparselt"], default="cusparselt" + ) + parser.add_argument("-contiguous", action="store_true") + parser.add_argument("-e2e", action="store_true") + parser.add_argument("-save", action="store_true") + args = parser.parse_args() + + if args.e2e: + eval_fn = test_linear + 
else: + eval_fn = test_tensor + + print(f"Started benchmark: {args.mode} | dtype: {args.dtype}") + dtype = dtype_lookup[args.dtype] + + if args.mode == "nvidia-bert": + bert_shapes = [ + (3072, 1024, 16384), + (4096, 1024, 16384), + (1024, 1024, 16384), + (1024, 4096, 16384), + ] + results = ( + eval_fn(m, k, n, dtype, args.contiguous, args.backend) + for (m, k, n) in tqdm(bert_shapes) + ) + + elif args.mode == "nvidia-fixed-k": + mn_vals = [ + 3072, + 4096, + 5120, + 6144, + 7168, + 8192, + 9216, + 10240, + 11264, + 12288, + 13312, + 14336, + 15360, + 16384, + 17408, + 18432, + 19456, + 20480, + ] + results = ( + eval_fn(mn, 10240, mn, dtype, args.contiguous, args.backend) + for mn in tqdm(mn_vals) + ) + + elif args.mode == "nvidia-fixed-mn": + k_vals = [ + 2560, + 3840, + 5120, + 6400, + 7680, + 8960, + 10240, + 11520, + 12800, + 14080, + 15360, + 16640, + 17920, + 19200, + 20480, + ] + results = ( + eval_fn(10240, k, 10240, dtype, args.contiguous, args.backend) + for k in tqdm(k_vals) + ) + + df = pd.DataFrame.from_records(results) + if args.save: + save_file = f"{args.mode}_{args.dtype}_{args.backend}.csv" + df.to_csv(save_file) + print(f"Finished benchmark: {args.mode} saved results to {save_file}") + print(df) diff --git a/test/test_sparse_semi_structured.py b/test/test_sparse_semi_structured.py index 52af386cd2bb4..2292dca8c9714 100644 --- a/test/test_sparse_semi_structured.py +++ b/test/test_sparse_semi_structured.py @@ -244,17 +244,18 @@ def test_mlp_contiguous_relu_compile_cutlass(self): @unittest.skipIf("cusparselt" not in SEMI_STRUCTURED_SUPPORTED_BACKENDS, "cusparselt not supported on this machine") def test_sp24_compile(self) -> None: x = torch.randn([1024, 512], device="cuda", dtype=torch.float16, requires_grad=True) + e = torch.eye(x.shape[0], x.shape[0], device="cuda", dtype=torch.float16) - def fn(x): + def fn(x, e): y = SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sort(x) y = y.t() return x @ y # Eager - output = fn(x) + output = fn(x, e) output.backward(output) # Torch compile - output = torch.compile(fn)(x) + output = torch.compile(fn)(x, e) output.backward(output) class TestSparseSemiStructured(TestCase): @@ -1155,9 +1156,8 @@ def test_cslt_sparse_mm_alg_id(self, device, dtype): B = torch.ones((128, 128), device=device).to(dtype) A_compressed = torch._cslt_compress(A) - alg_id, split_k, split_k_one_kernel, _ = torch._C._cusparselt.mm_search(A_compressed, B.t(), None, None, None, False) - sparse_result = torch._cslt_sparse_mm(A_compressed, B.t(), - alg_id=alg_id, split_k=split_k, split_k_one_kernel=split_k_one_kernel) + alg_id = torch._cslt_sparse_mm_search(A_compressed, B.t()) + sparse_result = torch._cslt_sparse_mm(A_compressed, B.t(), alg_id=alg_id) dense_result = torch.mm(A.to(torch.float32), B.to(torch.float32)) dense_result = dense_result.to(dtype) @@ -1174,16 +1174,6 @@ def test_cslt_sparse_mm_search(self, device, dtype): alg_id = torch._cslt_sparse_mm_search(A_compressed, B.t()) assert alg_id in range(torch.backends.cusparselt.get_max_alg_id()) - @inference_dtypes - def test_csrc_cslt_sparse_mm_search(self, device, dtype): - A = rand_sparse_semi_structured_mask(256, 128, dtype=dtype) - A_compressed = torch._cslt_compress(A) - B = torch.ones((128, 128), device=device).to(dtype) - - A_compressed = torch._cslt_compress(A) - alg_id, _, _, _ = torch._C._cusparselt.mm_search(A_compressed, B.t(), None, None, None, False) - assert alg_id in range(torch.backends.cusparselt.get_max_alg_id()) - def test_cusparselt_backend(self): version = 
_get_torch_cuda_version() assert torch.backends.cusparselt.is_available() @@ -1191,11 +1181,9 @@ def test_cusparselt_backend(self): # CUDA 11.8 has cuSPARSELt v0.4.0 support if version == (11, 8): assert torch.backends.cusparselt.version() == 400 - assert torch.backends.cusparselt.get_max_alg_id() == 4 # CUDA 12.1 has cuSPARSELt v0.5.2 support elif version == (12, 1): assert torch.backends.cusparselt.version() == 502 - assert torch.backends.cusparselt.get_max_alg_id() == 4 # CUDA 12.4+ has cuSPARSELt v0.6.2 support elif version >= (12, 4): assert torch.backends.cusparselt.version() == 602 diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py index 56107712a7a43..d7aed0214e951 100644 --- a/torch/_inductor/kernel/mm.py +++ b/torch/_inductor/kernel/mm.py @@ -1,7 +1,7 @@ # mypy: allow-untyped-defs import functools import logging -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import torch from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm @@ -144,12 +144,6 @@ def lazy_register_extern_choice(fn): has_out_variant=False, ) -aten__cslt_sparse_mm = ExternKernelChoice( - torch._cslt_sparse_mm, - "at::_cslt_sparse_mm", - has_out_variant=False, -) - def _is_int8_mat(mat): return mat.get_dtype() in (torch.int8, torch.uint8) @@ -527,124 +521,6 @@ def tuned_sparse_semi_structured_mm( ) -@register_lowering(aten._cslt_sparse_mm, type_promotion_kind=None) -def tuned_cslt_sparse_mm( - mat1_compressed, - mat2, - bias=None, - alpha=None, - out_dtype=None, - transpose_result=False, - alg_id=0, - split_k=1, - split_k_one_kernel=True, - layout=None, -): - from torch._inductor.select_algorithm import AlgorithmSelectorCache, realize_inputs - - mat1_compressed, mat2 = realize_inputs(mat1_compressed, mat2) - input_nodes: Tuple[Any, ...] 
= (mat1_compressed, mat2) - k, n = mat2.get_size() - - is_8bit_input_type = mat1_compressed.dtype in [torch.int8, torch.float8_e4m3fn] - compression_factor = 10 if is_8bit_input_type else 9 - m = (mat1_compressed.get_numel() * 16) // (compression_factor * k) - - from torch._inductor.ir import FixedLayout - - if transpose_result: - layout = FixedLayout( - mat2.get_device(), - out_dtype if out_dtype else mat2.get_dtype(), - [n, m], - [m, 1], - ) - else: - layout = FixedLayout( - mat2.get_device(), - out_dtype if out_dtype else mat2.get_dtype(), - [m, n], - [n, 1], - ) - # workaround for Inductor not supporting optional tensor input arguments - if bias is not None: - bias = realize_inputs(bias) - input_nodes = input_nodes + (bias,) - - if alpha is not None: - alpha = realize_inputs(alpha) - input_nodes = input_nodes + (alpha,) - - # cuSPARSELt alg_id search, not that we cannot use - # AlgorithmSelectorCache.benchmark_example_value() because this will return the base view - # and mat2 needs to have transpose properties preserved for cslt mm - ( - searched_alg_id, - searched_split_k, - searched_split_k_one_kernel, - _, - ) = torch._C._cusparselt.mm_search( # type: ignore[attr-defined] - AlgorithmSelectorCache.generate_example_value( - V.graph.sizevars.size_hints(mat1_compressed.get_size()), - V.graph.sizevars.size_hints(mat1_compressed.get_stride()), - mat1_compressed.get_device(), - mat1_compressed.dtype, - mat1_compressed.layout.offset, - ), - AlgorithmSelectorCache.generate_example_value( - V.graph.sizevars.size_hints(mat2.get_size()), - V.graph.sizevars.size_hints(mat2.get_stride()), - mat2.get_device(), - mat2.dtype, - mat2.layout.offset, - ), - AlgorithmSelectorCache.generate_example_value( - V.graph.sizevars.size_hints(bias.get_size()), - V.graph.sizevars.size_hints(bias.get_stride()), - bias.get_device(), - bias.dtype, - bias.layout.offset, - ) - if bias is not None - else None, - AlgorithmSelectorCache.generate_example_value( - V.graph.sizevars.size_hints(alpha.get_size()), - V.graph.sizevars.size_hints(alpha.get_stride()), - alpha.get_device(), - alpha.dtype, - alpha.layout.offset, - ) - if alpha is not None - else None, - out_dtype, - transpose_result, - ) - - baseline = aten__cslt_sparse_mm.bind( - input_nodes, - layout, - out_dtype=out_dtype, - alg_id=0, - split_k=1, - split_k_one_kernel=True, - transpose_result=transpose_result, - ) - baseline.description = f"ALG_ID: 0 SPLIT_K: 1 SPLIT_K_ONE_KERNEL: True TRANSPOSE_RESULT: {transpose_result}" - searched = aten__cslt_sparse_mm.bind( - input_nodes, - layout, - out_dtype=out_dtype, - alg_id=searched_alg_id, - split_k=searched_split_k, - split_k_one_kernel=searched_split_k_one_kernel, - transpose_result=transpose_result, - ) - searched.description = f"ALG_ID: {searched_alg_id} SPLIT_K: {searched_split_k} SPLIT_K_ONE_KERNEL: {searched_split_k_one_kernel} TRANSPOSE_RESULT: {transpose_result}" # noqa: B950 - choices = [baseline, searched] - - return autotune_select_algorithm("cslt_sparse_mm", choices, input_nodes, layout) - - def fallback_mixed_mm(mat1, mat2, *, out): return torch.mm(mat1, mat2.to(mat1.dtype), out=out) diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index a022a75e8047b..0da6b58bdb413 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -520,44 +520,32 @@ def meta__cslt_sparse_mm( alpha: Optional[Tensor] = None, out_dtype: Optional[torch.dtype] = None, transpose_result: bool = False, - alg_id: int = 0, - split_k: int = 1, - split_k_one_kernel: bool = False, ): assert 
dense_B.dtype in { torch.float32, torch.float16, torch.bfloat16, torch.int8, - torch.float8_e4m3fn, - }, "_cslt_sparse_mm only supports fp16, bf16, int8, and fp8e4m3" + }, "_cslt_sparse_mm only supports fp16, bf16, and int8" assert compressed_A.dtype == dense_B.dtype, "inputs must have the same dtype" assert len(dense_B.shape) == 2, "_cslt_sparse_mm only supports 2d inputs" - is_8bit_input_type = compressed_A.dtype in [torch.int8, torch.float8_e4m3fn] - compression_factor = 10 if is_8bit_input_type else 9 + is_int8_input_type = compressed_A.dtype == torch.int8 + compression_factor = 10 if is_int8_input_type else 9 k = dense_B.size(0) n = dense_B.size(1) m = (compressed_A.numel() * 16) // (compression_factor * k) if bias is not None: assert m == bias.size(0) - if is_8bit_input_type: - assert not dense_B.is_contiguous() - if out_dtype is not None: - assert ( - is_8bit_input_type - and out_dtype - in { - torch.float16, - torch.bfloat16, - torch.int32, - torch.float8_e4m3fn, - } - ), "out_dtype is not supported for {compressed_A.dtype} x {dense_B.dtype} -> {out_dtype} matmul!" + assert is_int8_input_type and out_dtype in { + torch.float16, + torch.bfloat16, + torch.int32, + }, "out_dtype is only supported for i8i8->fp16, bf16, or i32 matmul" output_shape = (n, m) if transpose_result else (m, n) - result = torch.empty(output_shape, dtype=out_dtype, device=compressed_A.device) + result = dense_B.new_empty(output_shape, dtype=out_dtype) return result diff --git a/torch/backends/cusparselt/__init__.py b/torch/backends/cusparselt/__init__.py index ebd33636f55be..da46274a2846d 100644 --- a/torch/backends/cusparselt/__init__.py +++ b/torch/backends/cusparselt/__init__.py @@ -25,12 +25,12 @@ def _init(): global __MAX_ALG_ID if __cusparselt_version is None: __cusparselt_version = _cusparselt.getVersionInt() - - # only way to get MAX_ALG_ID is to run a matmul - A = torch.zeros(128, 128, dtype=torch.float16).cuda() - A = torch._cslt_compress(A) - B = torch.zeros(128, 128, dtype=torch.float16).cuda() - _, _, _, __MAX_ALG_ID = _cusparselt.mm_search(A, B, None, None, None, False) # type: ignore[attr-defined] + if __cusparselt_version == 400: + __MAX_ALG_ID = 4 + elif __cusparselt_version == 502: + __MAX_ALG_ID = 5 + elif __cusparselt_version == 602: + __MAX_ALG_ID = 37 return True else: @@ -52,7 +52,6 @@ def is_available() -> bool: def get_max_alg_id() -> Optional[int]: - r"""Return the maximum algorithm id supported by the current version of cuSPARSELt""" if not _init(): return None return __MAX_ALG_ID diff --git a/torch/csrc/cuda/shared/cusparselt.cpp b/torch/csrc/cuda/shared/cusparselt.cpp index 02be708e91398..ca020b75a706f 100644 --- a/torch/csrc/cuda/shared/cusparselt.cpp +++ b/torch/csrc/cuda/shared/cusparselt.cpp @@ -1,7 +1,7 @@ #include #ifdef USE_CUSPARSELT -#include +#include namespace { @@ -9,34 +9,6 @@ size_t getVersionInt() { return CUSPARSELT_VERSION; } -std::tuple mmSearch( - const at::Tensor& compressed_A, - const at::Tensor& dense_B, - const std::optional& bias_opt, - const std::optional& alpha_opt, - const std::optional out_dtype_opt, - bool transpose_result) { - int alg_id_int = 0; - int split_k = 1; - bool split_k_one_kernel = true; - auto result = at::native::_cslt_sparse_mm_impl( - compressed_A, - dense_B, - bias_opt, - alpha_opt, - out_dtype_opt, - transpose_result, - alg_id_int, - split_k, - split_k_one_kernel, - true); - return { - (int64_t)std::get<1>(result), - (int64_t)std::get<2>(result), - (bool)std::get<3>(result), - (int64_t)std::get<4>(result)}; -} - } // namespace namespace 
torch::cuda::shared { @@ -45,7 +17,6 @@ void initCusparseltBindings(PyObject* module) { auto m = py::handle(module).cast(); auto cusparselt = m.def_submodule("_cusparselt", "libcusparselt.so bindings"); cusparselt.def("getVersionInt", getVersionInt); - cusparselt.def("mm_search", mmSearch); } } // namespace torch::cuda::shared diff --git a/torch/sparse/_semi_structured_ops.py b/torch/sparse/_semi_structured_ops.py index 9a5fdca947a06..eb5557bf8b0d4 100644 --- a/torch/sparse/_semi_structured_ops.py +++ b/torch/sparse/_semi_structured_ops.py @@ -103,8 +103,6 @@ def semi_sparse_detach(func, types, args, kwargs) -> torch.Tensor: packed_t=self.packed_t, meta_t=self.meta_t, compressed_swizzled_bitmask=self.compressed_swizzled_bitmask, - fuse_transpose_cusparselt=self.fuse_transpose_cusparselt, - alg_id_cusparselt=self.alg_id_cusparselt, requires_grad=False, ) @@ -179,37 +177,19 @@ def semi_sparse_scaled_mm(func, types, args=(), kwargs=None) -> torch.Tensor: assert A.dtype == torch.float8_e4m3fn assert B.dtype == torch.float8_e4m3fn - # cuSPARSELt lacks the A and B operand scaling support, so instead we use alpha to scale the result. - # Note that this limits us to per-tensor scalig only. + # only cuSPARSELt supports float8_e4m3fn currentl + assert isinstance(A, torch.sparse.SparseSemiStructuredTensorCUSPARSELT) + assert A.packed is not None + # Currently we only support per-tensor scaling, with float32 scales assert A_scale.numel() == 1 and B_scale.numel() == 1 assert A_scale.dtype == torch.float32 and B_scale.dtype == torch.float32 - # only cuSPARSELt supports float8_e4m3fn currentl - if isinstance(A, torch.sparse.SparseSemiStructuredTensorCUSPARSELT): - assert A.packed is not None - row, col = B.shape - B_padded = A._pad_dense_input(B).contiguous().t() - sparse_result = torch._cslt_sparse_mm( - A.packed, - B_padded, - alpha=A_scale * B_scale, - out_dtype=out_dtype, - bias=bias, - ) - return sparse_result[:, :col] - else: - assert isinstance(B, torch.sparse.SparseSemiStructuredTensor) - assert B.packed is not None - row, col = A.shape - A_padded = B._pad_dense_input(A) - sparse_result = torch._cslt_sparse_mm( - B.packed, - A_padded.t(), - alpha=A_scale * B_scale, - out_dtype=out_dtype, - bias=bias, - transpose_result=B.fuse_transpose_cusparselt, - ) - sparse_result = ( - sparse_result if B.fuse_transpose_cusparselt else sparse_result.t() - ) - return sparse_result[:row, :] + + # cuSPARSELt lacks the A and B operand scaling support, so instead we use alpha to scale the result. + # Note that this limits us to per-tensor scalig only. + sparse_result = torch._cslt_sparse_mm( + A.packed, + B, + alpha=A_scale * B_scale, + out_dtype=out_dtype, + ) + return sparse_result From e7f1e306dfd1675e4fbe126ca01a9bbbeb49ecc5 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Thu, 24 Oct 2024 17:46:09 +0000 Subject: [PATCH 027/161] Revert "[c10d][Partial-Graph Overlap] Support calling .wait_tensor() within compiled region on output tensor of eager `async_op=True` collective (#137763)" This reverts commit 362ca54f03f9bb72ba7633ed580fb788b1a8dea9. 
Reverted https://github.com/pytorch/pytorch/pull/137763 on behalf of https://github.com/wdvr due to this change is breaking our prod training pipeline (verified with bisect) by increasing memory consumption 4x and causing OOM ([comment](https://github.com/pytorch/pytorch/pull/137763#issuecomment-2435962833)) --- .../test_c10d_functional_native.py | 18 --- test/distributed/test_c10d_nccl.py | 42 ------ test/distributed/test_inductor_collectives.py | 72 +-------- torch/csrc/distributed/c10d/Functional.cpp | 99 ++++++++++++- torch/csrc/distributed/c10d/Functional.hpp | 10 +- torch/csrc/distributed/c10d/ProcessGroup.cpp | 135 ----------------- torch/csrc/distributed/c10d/ProcessGroup.hpp | 138 +++--------------- .../distributed/c10d/ProcessGroupGloo.cpp | 7 - .../csrc/distributed/c10d/ProcessGroupMPI.cpp | 4 - .../distributed/c10d/ProcessGroupNCCL.cpp | 3 - .../csrc/distributed/c10d/ProcessGroupUCC.cpp | 4 - torch/csrc/distributed/c10d/Work.cpp | 6 +- torch/csrc/distributed/c10d/init.cpp | 4 - 13 files changed, 125 insertions(+), 417 deletions(-) diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py index b1c99145311c1..0f4bf91edc21b 100644 --- a/test/distributed/test_c10d_functional_native.py +++ b/test/distributed/test_c10d_functional_native.py @@ -405,22 +405,6 @@ def test_broadcast(self) -> None: assert output.eq(expect).all() assert output.completed - @skip_if_lt_x_gpu(2) - def test_wait_tensor(self) -> None: - self._init_process_group() - - input = torch.full((10, 10), float(self.rank), device=self.device) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - output = torch.ops._c10d_functional.all_reduce( - input, - "avg", - "default", - ) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) - torch.ops._c10d_functional.wait_tensor(output) - # `wait_tensor(output)` will pop the work from the work registry immediately - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - @skip_if_lt_x_gpu(2) def test_unwaited(self) -> None: # Verify that the process can terminate gracefully @@ -428,13 +412,11 @@ def test_unwaited(self) -> None: self._init_process_group() input = torch.full((10, 10), float(self.rank), device=self.device) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) output = torch.ops._c10d_functional.all_reduce( input, "avg", "default", ) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) @skip_if_lt_x_gpu(2) def test_py_work(self) -> None: diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 2626d694dbc11..2dcb0e1d2f066 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -3178,48 +3178,6 @@ def test_nccl_barrier_device_ids_function_argument(self): with self.assertRaisesRegex(TypeError, "Invalid function argument"): c10d.barrier(device_ids=self.rank) - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_unwaited(self) -> None: - # Verify that the process can terminate gracefully - # even with unwaited tensors - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", rank=self.rank, world_size=self.world_size, store=store - ) - - input = torch.full((10240, 10240), float(self.rank), device=f"cuda:{self.rank}") - dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) - # Running another collective on the same 
tensor should still work - dist.all_reduce(input, op=dist.ReduceOp.SUM, async_op=True) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 2) - - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_wait_tensor(self) -> None: - # Verify that c10d_functional.wait_tensor() can be invoked on - # output tensor of non-functional collective - store = c10d.FileStore(self.file_name, self.world_size) - c10d.init_process_group( - backend="nccl", rank=self.rank, world_size=self.world_size, store=store - ) - - input1 = torch.full((10, 10), float(self.rank), device=f"cuda:{self.rank}") - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - dist.all_reduce(input1, op=dist.ReduceOp.SUM, async_op=True) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) - torch.ops.c10d_functional.wait_tensor(input1) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - - input2 = torch.full((10, 10), float(self.rank), device=f"cuda:{self.rank}") - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - work = dist.all_reduce(input2, op=dist.ReduceOp.SUM, async_op=True) - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 1) - work.wait() - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - self.assertEqual(input1, input2) - @requires_nccl() @skip_if_lt_x_gpu(2) @with_dist_debug_levels(levels=["DETAIL"]) diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index 4c8650c1a88b8..f59c471a0f978 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -1,5 +1,4 @@ # Owner(s): ["module: dynamo"] -import datetime import functools import unittest from unittest.mock import patch @@ -15,7 +14,7 @@ from torch._dynamo.testing import CompileCounter from torch._dynamo.utils import same from torch._inductor.compile_fx import compile_fx as inductor_compile_fx -from torch._inductor.utils import run_and_get_code, run_and_get_triton_code +from torch._inductor.utils import run_and_get_triton_code from torch.distributed.distributed_c10d import GroupMember from torch.fx.experimental.proxy_tensor import make_fx from torch.testing._internal.common_distributed import ( @@ -29,7 +28,6 @@ instantiate_parametrized_tests, parametrize, requires_cuda, - skipIfRocm, ) from torch.testing._internal.inductor_utils import HAS_GPU @@ -247,74 +245,6 @@ def compile(func, example_inputs): ) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") - @skip_if_lt_x_gpu(2) - @skipIfRocm - def test_eager_async_allreduce_inductor_wait(self): - import torch.distributed as dist - - def all_reduce_non_functional_eager(x): - y = x * x - work = dist.all_reduce(y, op=dist.ReduceOp.SUM, async_op=True) - assert isinstance(work, torch.distributed.Work) - return work, y - - def all_reduce_wait(work, y): # potentially compiled - if torch.compiler.is_dynamo_compiling(): - torch.ops.c10d_functional.wait_tensor(y) - else: - work.wait(datetime.timedelta(seconds=10)) - # Under compile, if `wait_tensor(y)` above is correctly executed, - # `y`'s data is in its final form and the output of this function will match eager; - # otherwise, `y * y` will run in parallel with `all_reduce(y)` and the output of this function - # will not match eager. 
- return y * y - - with _dynamo_dist_per_rank_init(self.rank, self.world_size): - x = torch.ones(12800, 12800, device="cuda") + self.rank - self.assertEqual(torch._C._distributed_c10d._get_work_registry_size(), 0) - - # NOTE: We run for 10 iterations each, to ensure that the GPU execution is way behind CPU - # and that `y * y` on CPU side will be issued before `all_reduce(y)` on GPU side is done, - # thus guaranteeing that in the bad case `y * y` on GPU side will run in parallel with `all_reduce(y)` - # thus will produce the wrong result that fails the unit test. - - # Test: pure-eager - all_reduce_wait_eager = all_reduce_wait - for _ in range(10): - work, y = all_reduce_non_functional_eager(x) - self.assertEqual( - torch._C._distributed_c10d._get_work_registry_size(), 1 - ) - out_ref = all_reduce_wait_eager(work, y) - # `work.wait()` will pop the work from the work registry immediately - self.assertEqual( - torch._C._distributed_c10d._get_work_registry_size(), 0 - ) - - # Test: issue comm in eager -> wait for comm in compile - all_reduce_wait_compiled = torch.compile( - all_reduce_wait, - backend="inductor", - fullgraph=True, - ) - for _ in range(10): - work, y = all_reduce_non_functional_eager(x) - self.assertEqual( - torch._C._distributed_c10d._get_work_registry_size(), 1 - ) - out_compiled, triton_codes = run_and_get_code( - all_reduce_wait_compiled, work, y - ) - # `wait_tensor(y)` will pop the work from the work registry immediately - self.assertEqual( - torch._C._distributed_c10d._get_work_registry_size(), 0 - ) - FileCheck().check( - "torch.ops._c10d_functional.wait_tensor.default(" - ).run(triton_codes[0]) - self.assertEqual(out_ref, out_compiled) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) @patch.object(torch._inductor.config, "allow_buffer_reuse", True) diff --git a/torch/csrc/distributed/c10d/Functional.cpp b/torch/csrc/distributed/c10d/Functional.cpp index a8c5d018778f3..1117718ee5093 100644 --- a/torch/csrc/distributed/c10d/Functional.cpp +++ b/torch/csrc/distributed/c10d/Functional.cpp @@ -6,10 +6,80 @@ #include #include #include +#include #include namespace { +class WorkRegistry { + public: + void register_work( + const at::Tensor& tensor, + const c10::intrusive_ptr& work) { + auto storage = tensor.storage().getWeakStorageImpl(); + std::unique_lock lock(lock_); + auto [it, inserted] = registry_.try_emplace(std::move(storage), work); + TORCH_CHECK( + inserted || it->second != work, + "The tensor storage is already associated with another work."); + } + + c10::intrusive_ptr pop_work(const at::Tensor& tensor) { + const auto storage = tensor.storage().getWeakStorageImpl(); + std::unique_lock lock(lock_); + auto it = registry_.find(storage); + if (it == registry_.end()) { + return nullptr; + } + auto work = it->second; + registry_.erase(it); + return work; + } + + ~WorkRegistry() { + // If there are still unwaited work objects, their corresponding process + // groups should have already been destroyed at this stage. Any attempts to + // wait for these work objects or to destroy them will only result in + // confusing errors. Therefore, we simply issue a warning and intentionally + // allow the unwaited work objects to leak. + if (!registry_.empty()) { + TORCH_WARN( + "At the time of process termination, there are still ", + registry_.size(), + " unwaited c10d_functional collective calls. 
" + "Please review your program to ensure c10d_functional.wait_tensor() " + "is invoked on all tensors returned from c10d_functional collective " + "ops before they are used."); + } + for (auto& it : registry_) { + it.second.release(); + } + } + + private: + std::unordered_map< + c10::weak_intrusive_ptr, + c10::intrusive_ptr> + registry_; + std::mutex lock_; +}; + +static WorkRegistry process_registry; + +} // namespace + +namespace c10d { + +void register_work( + const at::Tensor& tensor, + const c10::intrusive_ptr& work) { + RankLocal::get().register_work(tensor, work); +} + +} // namespace c10d + +namespace { + const std::unordered_map str_to_reduce_op = { {"sum", c10d::ReduceOp(c10d::ReduceOp::RedOpType::SUM)}, {"avg", c10d::ReduceOp(c10d::ReduceOp::RedOpType::AVG)}, @@ -42,6 +112,7 @@ at::Tensor& all_reduce_( std::vector inputs{input}; auto group = c10d::resolve_process_group(group_name); auto work = group->allreduce(inputs, opts); + c10d::register_work(input, work); return input; } @@ -64,6 +135,9 @@ std::vector all_reduce_coalesced_( auto group = c10d::resolve_process_group(group_name); auto work = group->allreduce_coalesced(inputs, opts); + for (const auto& tensor : inputs) { + c10d::register_work(tensor, work); + } return inputs; } @@ -104,6 +178,9 @@ std::vector all_gather_into_tensor_coalesced( auto group = c10d::resolve_process_group(group_name); auto work = group->allgather_into_tensor_coalesced(outputs, inputs); + for (const auto& tensor : outputs) { + c10d::register_work(tensor, work); + } return outputs; } @@ -125,6 +202,7 @@ at::Tensor& all_gather_into_tensor_out( auto group = c10d::resolve_process_group(group_name); auto work = group->_allgather_base(output, input, opts); + c10d::register_work(output, work); return output; } @@ -160,6 +238,9 @@ std::vector reduce_scatter_tensor_coalesced( auto group = c10d::resolve_process_group(group_name); auto work = group->reduce_scatter_tensor_coalesced(outputs, inputs, opts); + for (const auto& tensor : outputs) { + c10d::register_work(tensor, work); + } return outputs; } @@ -191,6 +272,7 @@ at::Tensor all_to_all_single( const_cast(input), output_split_sizes, input_split_sizes); + c10d::register_work(output, work); return output; } @@ -202,6 +284,7 @@ at::Tensor& broadcast_(at::Tensor& input, int64_t src, std::string group_name) { auto group = c10d::resolve_process_group(group_name); auto work = group->broadcast(inputs, opts); + c10d::register_work(input, work); return input; } @@ -213,6 +296,14 @@ at::Tensor broadcast( return broadcast_(output, src, std::move(group_name)); } +at::Tensor wait_tensor(const at::Tensor& tensor) { + auto work = c10d::RankLocal::get().pop_work(tensor); + if (work != nullptr) { + work->wait(); + } + return tensor; +} + } // namespace TORCH_LIBRARY(_c10d_functional, m) { @@ -298,7 +389,7 @@ TORCH_LIBRARY(_c10d_functional, m) { m.def( "wait_tensor(Tensor tensor) -> Tensor", torch::dispatch( - c10::DispatchKey::CompositeExplicitAutograd, c10d::wait_tensor), + c10::DispatchKey::CompositeExplicitAutograd, ::wait_tensor), {at::Tag::pt2_compliant_tag}); } @@ -347,7 +438,7 @@ class AllToAllSingle : public torch::autograd::Function { // TODO: track active cuda stream in wait out = c10::Dispatcher::singleton() .findSchemaOrThrow("_c10d_functional::wait_tensor", "") - .typed() + .typed() .call(out); return {out, at::Tensor(), at::Tensor(), at::Tensor()}; @@ -402,7 +493,7 @@ class ReduceScatterTensor // TODO: track active cuda stream in wait out = c10::Dispatcher::singleton() 
.findSchemaOrThrow("_c10d_functional::wait_tensor", "") - .typed() + .typed() .call(out); return { @@ -458,7 +549,7 @@ class AllGatherIntoTensor // TODO: track active cuda stream in wait out = c10::Dispatcher::singleton() .findSchemaOrThrow("_c10d_functional::wait_tensor", "") - .typed() + .typed() .call(out); return { diff --git a/torch/csrc/distributed/c10d/Functional.hpp b/torch/csrc/distributed/c10d/Functional.hpp index e81d44b8dbd23..cbb19e686095a 100644 --- a/torch/csrc/distributed/c10d/Functional.hpp +++ b/torch/csrc/distributed/c10d/Functional.hpp @@ -1,3 +1,11 @@ #pragma once -#include +#include + +namespace c10d { + +C10_EXPORT void register_work( + const at::Tensor& tensor, + const c10::intrusive_ptr& work); + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp index dffe20aebdd90..63d64447dfdb9 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -160,137 +159,3 @@ void ProcessGroup::release_resources() { } } // namespace c10d - -namespace { - -class WorkRegistry { - public: - void register_work( - const at::Tensor& tensor, - const c10::intrusive_ptr& work) { - if (!tensor.has_storage()) { - TORCH_WARN_ONCE( - "Registering collective work for tensor without storage is not supported. " - "Calling c10d_functional.wait_tensor() on this tensor will not wait for the collective to complete. " - "Unsupported tensor type: " + - tensor.toString()); - return; - } - auto storage = tensor.storage().getWeakStorageImpl(); - std::unique_lock lock(lock_); - - auto it = registry_.find(storage); - if (it == registry_.end()) { - registry_.emplace( - std::move(storage), - std::vector>{work}); - } else { - // There is no guarantee that the previous work object for this - // tensor storage is completed before the new work object is registered. - // Therefore we need to maintain a list of work objects for each tensor - // storage. - it->second.push_back(work); - } - } - - std::vector> pop_works( - const at::Tensor& tensor) { - const auto storage = tensor.storage().getWeakStorageImpl(); - std::unique_lock lock(lock_); - auto it = registry_.find(storage); - if (it == registry_.end()) { - return {}; - } - auto works = it->second; - registry_.erase(it); - return works; - } - - void unregister_work(const c10::intrusive_ptr& work) { - std::unique_lock lock(lock_); - for (auto it = registry_.begin(); it != registry_.end();) { - std::vector> nonmatching_works; - for (const auto& _work : it->second) { - if (_work != work) { - nonmatching_works.push_back(_work); - } - } - if (nonmatching_works.empty()) { - it = registry_.erase(it); - } else { - it->second = std::move(nonmatching_works); - ++it; - } - } - } - - size_t get_work_registry_size() { - std::unique_lock lock(lock_); - size_t total_size = 0; - for (const auto& [storage, works] : registry_) { - total_size += works.size(); - } - return total_size; - } - - ~WorkRegistry() { - // If there are still unwaited work objects, their corresponding process - // groups should have already been destroyed at this stage. Any attempts to - // wait for these work objects or to destroy them will only result in - // confusing errors. Therefore, we simply issue a warning and intentionally - // allow the unwaited work objects to leak. 
- size_t registry_size = get_work_registry_size(); - if (registry_size > 0) { - TORCH_WARN( - "At the time of process termination, there are still ", - registry_size, - " unwaited collective calls. " - "Please review your program to ensure that:\n" - "1. c10d_functional.wait_tensor() is invoked on all tensors returned from c10d_functional collective,\n" - "2. work.wait() is invoked on work object returned from torch.distributed collective with async_op=True,\n" - "before the output tensors of the collective are used."); - } - for (auto& it : registry_) { - for (auto& work : it.second) { - work.release(); - } - } - } - - private: - std::unordered_map< - c10::weak_intrusive_ptr, - std::vector>> - registry_; - std::mutex lock_; -}; - -static WorkRegistry process_registry; - -} // namespace - -namespace c10d { - -void register_work( - const at::Tensor& tensor, - const c10::intrusive_ptr& work) { - RankLocal::get().register_work(tensor, work); -} - -at::Tensor wait_tensor(const at::Tensor& tensor) { - auto works = RankLocal::get().pop_works(tensor); - for (const auto& work : works) { - work->wait(); - } - return tensor; -} - -void unregister_work(const c10::intrusive_ptr& work) { - RankLocal::get().unregister_work(work); -} - -size_t get_work_registry_size() { - return RankLocal::get().get_work_registry_size(); -} - -} // namespace c10d diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index febf885a112b3..463d1f046db52 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -24,16 +23,6 @@ constexpr auto kProcessGroupDefaultTimeout = namespace c10d { -C10_EXPORT void register_work( - const at::Tensor& tensor, - const c10::intrusive_ptr& work); - -C10_EXPORT at::Tensor wait_tensor(const at::Tensor& tensor); - -C10_EXPORT void unregister_work(const c10::intrusive_ptr& work); - -C10_EXPORT size_t get_work_registry_size(); - // ProcessGroup is a base class that captures collective and point to // point communication in a fixed set of processes. // @@ -169,18 +158,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { // It's awakward to unbox the opts here and box them again in the custom C++ // op. But it's also complicated to make opts as a CustomClassHolder. Leave // it as it is now. 
- auto work = std::get<1>(op.call( + return std::get<1>(op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.rootRank, opts.rootTensor, opts.asyncOp, opts.timeout.count())); - - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr allreduce( @@ -197,17 +181,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const std::optional& sparse_indices, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive(opts.reduceOp), opts.sparseIndices, opts.timeout.count())); - - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr allreduce_coalesced( @@ -221,16 +200,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ReduceOp>&, int64_t)>(); - auto work = op.call( + return op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive(opts.reduceOp), opts.timeout.count()); - - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr reduce( @@ -245,18 +219,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { int64_t, int64_t, int64_t)>(); - auto work = op.call( + return op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive(opts.reduceOp), opts.rootRank, opts.rootTensor, opts.timeout.count()); - - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr allgather( @@ -273,18 +242,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.timeout.count())); - - for (const auto& tensor_list : outputTensors) { - for (const auto& tensor : tensor_list) { - c10d::register_work(tensor, work); - } - } - return work; } // Gathers a single tensor inputBuffer into a single buffer outputBuffer that @@ -305,15 +267,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { bool, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputBuffer, inputBuffer, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.asyncOp, opts.timeout.count())); - - c10d::register_work(outputBuffer, work); - return work; } // This function is deprecated and will be moved out of ProcessGroup to comms: @@ -332,17 +291,10 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const at::TensorList&, const c10::intrusive_ptr<::c10d::ProcessGroup>&)>(); - auto work = op.call( + return op.call( outputTensorLists, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this)); - - for (const auto& tensor_list : outputTensorLists) { - for (const auto& tensor : tensor_list) { - c10d::register_work(tensor, work); - } - } - return work; } // This function is a coalesced version of `allgather_into_tensor` (currently @@ -360,15 +312,10 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const at::TensorList, const c10::intrusive_ptr<::c10d::ProcessGroup>&)>(); - auto work = op.call( + return op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this)); - - 
for (const auto& tensor : outputTensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr gather( @@ -383,19 +330,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t, int64_t)>(); - auto work = op.call( + return op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.rootRank, opts.timeout.count()); - - for (const auto& tensor_list : outputTensors) { - for (const auto& tensor : tensor_list) { - c10d::register_work(tensor, work); - } - } - return work; } virtual c10::intrusive_ptr scatter( @@ -413,18 +353,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { int64_t, bool, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.rootRank, opts.asyncOp, opts.timeout.count())); - - for (const auto& tensor : outputTensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr reduce_scatter( @@ -441,17 +376,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ProcessGroup>&, const c10::intrusive_ptr<::c10d::ReduceOp>&, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp), opts.timeout.count())); - - for (const auto& tensor : outputTensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr _reduce_scatter_base( @@ -468,16 +398,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ReduceOp>&, bool, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputBuffer, inputBuffer, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp), opts.asyncOp, opts.timeout.count())); - - c10d::register_work(outputBuffer, work); - return work; } // This function is a coalesced version of `reduce_scatter_tensor` (currently @@ -497,17 +424,12 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ReduceOp>&, int64_t)>(); - auto work = op.call( + return op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp), opts.timeout.count()); - - for (const auto& tensor : outputTensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr alltoall_base( @@ -525,16 +447,13 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { std::vector, std::vector, int64_t)>(); - auto work = op.call( + return op.call( outputBuffer, inputBuffer, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), outputSplitSizes, inputSplitSizes, opts.timeout.count()); - - c10d::register_work(outputBuffer, work); - return work; } virtual c10::intrusive_ptr alltoall( @@ -550,16 +469,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const at::TensorList&, const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t)>(); - auto work = std::get<1>(op.call( + return std::get<1>(op.call( outputTensors, inputTensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.timeout.count())); - - for (const auto& tensor : outputTensors) { - 
c10d::register_work(tensor, work); - } - return work; } virtual void monitoredBarrier( @@ -635,15 +549,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t, int64_t)>(); - auto work = op.call( + return op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), dstRank, tag); - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr recv( @@ -657,15 +567,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t, int64_t)>(); - auto work = op.call( + return op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), srcRank, tag); - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr recvAnysource( @@ -677,14 +583,10 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { at::TensorList, const c10::intrusive_ptr<::c10d::ProcessGroup>&, int64_t)>(); - auto work = op.call( + return op.call( tensors, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), tag); - for (const auto& tensor : tensors) { - c10d::register_work(tensor, work); - } - return work; } virtual c10::intrusive_ptr barrier( @@ -716,13 +618,11 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { const std::vector&, int64_t)>(); - auto work = op.call( + return op.call( tensor, c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this), opts.device_ids, opts.timeout.count()); - c10d::register_work(tensor, work); - return work; } bool hasBackends() { diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index 8ac81f4c396bd..3cb765a658912 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -5,7 +5,6 @@ #include #include -#include #include #include @@ -575,9 +574,6 @@ bool ProcessGroupGloo::SendWork::wait(std::chrono::milliseconds timeout) { // Completes the Work object and throws the exception. finishAndThrow(exception); - c10d::unregister_work( - c10::intrusive_ptr< - ProcessGroupGloo::SendWork>::unsafe_reclaim_from_nonowning(this)); return sendCompleted; } @@ -625,9 +621,6 @@ bool ProcessGroupGloo::RecvWork::wait(std::chrono::milliseconds timeout) { // Completes the Work object and throws the exception. finishAndThrow(exception); - c10d::unregister_work( - c10::intrusive_ptr< - ProcessGroupGloo::RecvWork>::unsafe_reclaim_from_nonowning(this)); return recvCompleted; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp index df6c3acda7054..91e9f938f1dd3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.cpp @@ -7,7 +7,6 @@ #include #include -#include #if defined(OPEN_MPI) && OPEN_MPI #include // Needed for CUDA-aware check @@ -199,9 +198,6 @@ bool ProcessGroupMPI::AsyncWork::wait(std::chrono::milliseconds /* unused */) { populateException(); std::rethrow_exception(exception_); } - c10d::unregister_work( - c10::intrusive_ptr< - ProcessGroupMPI::AsyncWork>::unsafe_reclaim_from_nonowning(this)); // Always return true, because abort API is not implemented. 
return true; } diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index eb16d6e09c904..fbd69dd7fd97f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -720,9 +720,6 @@ void ProcessGroupNCCL::WorkNCCL::handleException( void ProcessGroupNCCL::WorkNCCL::synchronize() { synchronizeStream(); - c10d::unregister_work( - c10::intrusive_ptr< - ProcessGroupNCCL::WorkNCCL>::unsafe_reclaim_from_nonowning(this)); } void ProcessGroupNCCL::WorkNCCL::synchronizeStream() { diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp index c1937aaf52a60..dab6aa6d26ece 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -274,9 +273,6 @@ bool ProcessGroupUCC::WorkUCC::wait(std::chrono::milliseconds /* unused */) { Work::recordFunctionEndCallback_(); Work::recordFunctionEndCallback_ = nullptr; } - c10d::unregister_work( - c10::intrusive_ptr< - ProcessGroupUCC::WorkUCC>::unsafe_reclaim_from_nonowning(this)); return true; } diff --git a/torch/csrc/distributed/c10d/Work.cpp b/torch/csrc/distributed/c10d/Work.cpp index af006e2d9857e..d7890566acbb3 100644 --- a/torch/csrc/distributed/c10d/Work.cpp +++ b/torch/csrc/distributed/c10d/Work.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -71,10 +70,7 @@ std::vector Work::result() { TORCH_CHECK(false, "result() not implemented."); } -void Work::synchronize() { - c10d::unregister_work( - c10::intrusive_ptr::unsafe_reclaim_from_nonowning(this)); -} +void Work::synchronize() {} bool Work::wait(std::chrono::milliseconds timeout) { std::unique_lock lock(mutex_); diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index f613bf0245502..67cf3b581b1fb 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -933,10 +933,6 @@ This class does not support ``__members__`` property.)"); py::arg("tensor"), py::arg("work")); - module.def("_get_work_registry_size", []() { - return ::c10d::get_work_registry_size(); - }); - // Remove a group from the native registry module.def( "_unregister_process_group", From 239a21f37e5a83eac82aed825b6adb99a791ecd9 Mon Sep 17 00:00:00 2001 From: Shunting Zhang Date: Wed, 23 Oct 2024 12:03:11 -0700 Subject: [PATCH 028/161] [Inductor] don't set XBLOCK larger than xnumel (#138730) When fp8 dtype is involved, Inductor may set min_elem_per_thread to a positive value. This will force increasing XBLOCK even for a small xnumel (e.g. 1). Inductor will report an error later when it sanity-checks the triton config. The simple fix here is to just not let XBLOCK be larger than xnumel.
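A rough sketch of the failure mode and the clamp (illustrative only: `pick_xblock` and the constants below are made up for this note, not the real `triton_config` code):

```python
# Hypothetical helper mirroring the idea of the fix: a min_elem_per_thread floor
# can push the block size well past xnumel, so clamp it back before the sanity check.
def pick_xblock(xnumel: int, num_warps: int = 4, threads_per_warp: int = 32,
                min_elem_per_thread: int = 2) -> int:
    xblock = num_warps * threads_per_warp * min_elem_per_thread  # e.g. 256 elements per block
    return min(xblock, xnumel)  # the fix: never let XBLOCK exceed xnumel


assert pick_xblock(xnumel=1) == 1  # without the clamp this would be 256 for a 1-element kernel
```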
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138730 Approved by: https://github.com/Chillee ghstack dependencies: #136782 --- test/inductor/test_fp8.py | 21 ++++++++++++++++++++ torch/_inductor/runtime/triton_heuristics.py | 1 + 2 files changed, 22 insertions(+) diff --git a/test/inductor/test_fp8.py b/test/inductor/test_fp8.py index 3348a90bc909e..72211eee70e74 100644 --- a/test/inductor/test_fp8.py +++ b/test/inductor/test_fp8.py @@ -88,6 +88,27 @@ def _quantize_rowwise(x: Tensor, float8_dtype: torch.dtype): @instantiate_parametrized_tests class TestFP8Types(TestCase): + @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg) + @unittest.skipIf(TEST_WITH_ROCM, "Not supported yet") + @parametrize("float8_dtype", (torch.float8_e4m3fn, torch.float8_e5m2)) + def test_xblock_for_small_numel(self, float8_dtype: torch.dtype): + """ + TritonOverrides.to_dtype will set min_elem_per_thread to 2 or 4 + depends on the variant of fp8 type. + This cause triton_heuristics.triton_config pick a XBLOCK larger + than numel and fail the config sanity check. + + We should not pick a XBLOCK larger than xnumel + """ + + def f(x): + return x.to(dtype=float8_dtype) + + x = torch.randn(1, device="cuda") + expected = f(x) + actual = torch.compile(f)(x) + torch.testing.assert_close(expected.half(), actual.half(), rtol=1e-2, atol=1e-2) + @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg) @unittest.skipIf(TEST_WITH_ROCM, "Not supported yet") @parametrize("dtype", (torch.float16, torch.bfloat16)) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 18c3f66175e9b..6a55df5fe3944 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1364,6 +1364,7 @@ def triton_config( x *= math.ceil(block_size / conditional_product(x, y, z)) x, _num_blocks = _check_max_grid_x(size_hints, x, num_warps) + x = min(x, size_hints[0]) cfg = {"XBLOCK": x} if y: From 6aa673377bb0c2dc1c9cf76d2e6d15ef722d835a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 22 Oct 2024 10:23:21 -0700 Subject: [PATCH 029/161] [PyTorch] Fix inductor CPU masked() body codegen when result dtype is bool and operator is where (#138486) In this case, it looks like we expect the body to be a VecMask (unify_mask_base_type is called by where()), but we didn't make it a VecMask. Now we do. Differential Revision: [D64702918](https://our.internmc.facebook.com/intern/diff/D64702918/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138486 Approved by: https://github.com/leslie-fang-intel, https://github.com/malfet --- test/inductor/test_cpu_repro.py | 14 ++++++++++++++ torch/_inductor/codegen/cpp.py | 23 +++++++++++++---------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index 72668f084f4e2..12b2ef7a3465c 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -4399,6 +4399,20 @@ def func2(arg0, arg1): ): check_use_full_bits(func, shapes, dtype, mixed, check_vecn) + @config.patch("cpp.simdlen", 256) + @requires_vectorization + def test_avx2_bool_constant_pad_nd(self): + # NOTE: I tried using (0, 12, 12) and removing the cpp.simdlen=256 override, but + # that didn't repro the issue. 
+ result = torch.testing.make_tensor( + (0, 6, 6), dtype=torch.bool, device=torch.device("cpu") + ) + + def fn(arg): + return torch.constant_pad_nd(arg, (1, 1, 1, 1, 1, 1)) + + self.common(fn, (result,)) + if __name__ == "__main__": from torch._inductor.test_case import run_tests diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index acdb05c9ee8a5..64a175de63997 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -1487,18 +1487,21 @@ def masked(mask, body, other): dtype = result.dtype body_code = f"{var}()" - body_code_vec = ( - body_code - if result.is_vec - else f"{V.kernel._get_vec_type(dtype)}({body_code})" - ) + + def maskify_or_vecify(code): + return ( + f"{V.kernel._get_mask_type()}::from({code})" + if dtype == torch.bool + else f"{V.kernel._get_vec_type(dtype)}({code})" + ) + + if result.is_vec: + body_code_vec = body_code + else: + body_code_vec = maskify_or_vecify(body_code) other_code = value_to_cpp(other, DTYPE_TO_CPP[dtype]) # loading bool as VecMask - other_code_vec = ( - f"{V.kernel._get_mask_type()}::from({other_code})" - if dtype == torch.bool - else f"{V.kernel._get_vec_type(dtype)}({other_code})" - ) + other_code_vec = maskify_or_vecify(other_code) assert isinstance(new_mask, CppCSEVariable), new_mask if new_mask.is_vec: code = BracesBuffer() From ead5738ff24724de427e4513954d76ee9c426cee Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 22 Oct 2024 10:23:22 -0700 Subject: [PATCH 030/161] [PyTorch] Fix inductor bug with unrolled vectorized prod (#138542) This issue is one of two inductor bugs blocking land of #137426. Turned out to be simple Differential Revision: [D64734116](https://our.internmc.facebook.com/intern/diff/D64734116/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138542 Approved by: https://github.com/jgong5, https://github.com/malfet ghstack dependencies: #138486 Co-authored-by: leslie-fang-intel --- aten/src/ATen/cpu/vec/vec_mask.h | 1 + test/inductor/test_cpu_repro.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/aten/src/ATen/cpu/vec/vec_mask.h b/aten/src/ATen/cpu/vec/vec_mask.h index a39ffa3090b8e..c547e5911ecbd 100644 --- a/aten/src/ATen/cpu/vec/vec_mask.h +++ b/aten/src/ATen/cpu/vec/vec_mask.h @@ -279,6 +279,7 @@ VEC_MASK_DEFINE_UNARY_OP_GLOBAL(operator~) VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator&) VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator|) VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator^) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator*) VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>, a & ~b) VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b) VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b)) diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index 12b2ef7a3465c..b8ed2c6644a39 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -4413,6 +4413,20 @@ def fn(arg): self.common(fn, (result,)) + @config.patch(unroll_reductions_threshold=9999) + @requires_vectorization + def test_unrolled_bool_prod_vectorized(self): + result = torch.zeros((37, 37, 37), dtype=torch.bool) + dim_select = [0, 1] + result.narrow(dim_select[0], 0, 1).narrow(dim_select[1], 1, 1).zero_() + result.narrow(dim_select[0], 2, 1).narrow(dim_select[1], 3, 1).zero_() + result.narrow(dim_select[0], 4, 1).narrow(dim_select[1], 3, 1).zero_() + + def fn(arg): + return torch.prod(arg, 1, dtype=torch.bool) + + self.common(fn, (result,)) + if __name__ == "__main__": from torch._inductor.test_case import run_tests 
From cbafe1e7f3f9f8af1c9351c7614982c6404a6cf1 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 22 Oct 2024 15:56:44 -0700 Subject: [PATCH 031/161] [PyTorch] Unbreak VectorizedN fmadd/fmsub/clamp (#138655) These are ternary ops, not binary ops. Differential Revision: [D64794253](https://our.internmc.facebook.com/intern/diff/D64794253/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138655 Approved by: https://github.com/jgong5, https://github.com/malfet ghstack dependencies: #138486, #138542 --- aten/src/ATen/cpu/vec/vec_n.h | 53 ++++++++++++++++++++--- aten/src/ATen/test/vec_test_all_types.cpp | 31 ++++++++++++- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/cpu/vec/vec_n.h b/aten/src/ATen/cpu/vec/vec_n.h index 8c4e622682a28..ec17ab0e45e51 100644 --- a/aten/src/ATen/cpu/vec/vec_n.h +++ b/aten/src/ATen/cpu/vec/vec_n.h @@ -77,6 +77,21 @@ class VectorizedN { return result; } + template + inline VectorizedN ternary_op( + const VectorizedN& other, + const VectorizedN& other2, + Op op) const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i], other.values[i], other2.values[i]); + } + return result; + } + VectorizedN() = default; explicit VectorizedN(T val) { @@ -89,7 +104,8 @@ class VectorizedN { VectorizedN(const Vectorized& val) : values({val}) {} template = 0> - VectorizedN(const Vectorized& val_0, const Vectorized& val_1) : values({val_0, val_1}) {} + VectorizedN(const Vectorized& val_0, const Vectorized& val_1) + : values({val_0, val_1}) {} template = 0> inline operator Vectorized() const { @@ -110,7 +126,8 @@ class VectorizedN { const VectorizedN& b) { VectorizedN result; for (int i = 0; i < N; ++i) { - result.values[i] = Vectorized::template blend(a.values[i], b.values[i]); + result.values[i] = + Vectorized::template blend(a.values[i], b.values[i]); } return result; } @@ -306,6 +323,20 @@ class VectorizedN { }); \ } +#define VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op( \ + const VectorizedN& a, \ + const VectorizedN& b, \ + const VectorizedN& c) { \ + return a.ternary_op( \ + b, \ + c, \ + [](const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& c) { return op(a, b, c); }); \ + } + #define VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(op) \ template \ inline VectorizedN& op( \ @@ -326,9 +357,9 @@ VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator<<) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator>>) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(maximum) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(minimum) -VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmadd) -VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmsub) -VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(fmadd) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(fmsub) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(clamp) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_max) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_min) VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator&) @@ -357,5 +388,17 @@ inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN acc_vec) { return vec_reduce_all(vec_fun, vec_result); } +template +std::ostream& operator<<(std::ostream& stream, const VectorizedN& vec_n) { + stream << "vec_n["; + for (int i = 0; i < N; ++i) { + if (i != 0) { + stream << ", "; + } + stream << vec_n[i]; + } + stream << ']'; + return stream; +} } // namespace CPU_CAPABILITY } // namespace at::vec diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index 
834788f57d98a..3791ebc357bc4 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -821,6 +821,17 @@ namespace { createDefaultTernaryTestCase(TestSeed()), RESOLVE_OVERLOAD(filter_clamp)); } + TYPED_TEST(MinMax, ClampVecN) { + using VT = ValueType; + using vec = at::vec::VectorizedN; + test_ternary( + NAME_INFO(clamp), clamp, + [](const vec& v0, const vec& v1, const vec& v2) { + return clamp(v0, v1, v2); + }, + createDefaultTernaryTestCase(TestSeed()), + RESOLVE_OVERLOAD(filter_clamp)); + } TYPED_TEST(BitwiseFloatsAdditional, ZeroMask) { using vec = TypeParam; using VT = ValueType; @@ -895,7 +906,25 @@ namespace { .setTestSeed(TestSeed()); test_ternary( - NAME_INFO(clamp), RESOLVE_OVERLOAD(local_fmadd), + NAME_INFO(fmadd), RESOLVE_OVERLOAD(local_fmadd), + [](const vec& v0, const vec& v1, const vec& v2) { + return at::vec::fmadd(v0, v1, v2); + }, + test_case, + RESOLVE_OVERLOAD(filter_fmadd)); + } + TYPED_TEST(BitwiseFloatsAdditional, FmaddVecN) { + using VT = ValueType; + using vec = at::vec::VectorizedN; + + auto test_case = TestingCase::getBuilder() + .addDomain(CheckWithinDomains{ + {{(VT)-1000, (VT)1000}, {(VT)-1000, (VT)1000}, {(VT)-1000, (VT)1000}}, + true, getDefaultTolerance()}) + .setTestSeed(TestSeed()); + + test_ternary( + NAME_INFO(fmadd), RESOLVE_OVERLOAD(local_fmadd), [](const vec& v0, const vec& v1, const vec& v2) { return at::vec::fmadd(v0, v1, v2); }, From 0af7632c10c61b3c345254fc192c84c5eeb551a9 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 23 Oct 2024 10:33:52 -0700 Subject: [PATCH 032/161] [PyTorch] Fix ASAN failures for vec_test_all_types Cast test (#138716) The size of the destination array was too small. Differential Revision: [D64843491](https://our.internmc.facebook.com/intern/diff/D64843491/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138716 Approved by: https://github.com/jgong5, https://github.com/malfet ghstack dependencies: #138486, #138542, #138655 --- aten/src/ATen/test/vec_test_all_types.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index 3791ebc357bc4..e2b64013a77d5 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1868,13 +1868,13 @@ namespace { #define TEST_MASK_CAST(dst_t, mask_t, mask_n) \ do { \ - CACHE_ALIGN mask_t x[mask_n * size]; \ - CACHE_ALIGN dst_t y[mask_n * size]; \ - auto seed = TestSeed(); \ - auto vec_mask = generate_vec_mask(seed); \ constexpr int num_dst_elements = \ std::min(size, at::vec::Vectorized::size()); \ constexpr int dst_n = mask_n * size / num_dst_elements; \ + CACHE_ALIGN mask_t x[mask_n * size]; \ + CACHE_ALIGN dst_t y[at::vec::VectorizedN::size()]; \ + auto seed = TestSeed(); \ + auto vec_mask = generate_vec_mask(seed); \ auto vec_mask_new = vec_mask.template cast(); \ vec_mask.template to().store(x); \ vec_mask_new.template to().store(y); \ From 10a34dcd57ac0b1a2a9005fcc1c6342ed3187545 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 23 Oct 2024 13:48:32 -0700 Subject: [PATCH 033/161] [PyTorch] Fix out-of-bounds array access in atomic_add_vec (#138744) There is no guarantee that `len` here is enough for a full vector. This was causing at least one test failure on https://github.com/pytorch/pytorch/pull/137426. 
Differential Revision: [D64857786](https://our.internmc.facebook.com/intern/diff/D64857786/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138744 Approved by: https://github.com/jgong5, https://github.com/malfet ghstack dependencies: #138486, #138542, #138655, #138716 --- torch/_inductor/codegen/cpp_prefix.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h index 68aae9812082b..0ae57c7d4c649 100644 --- a/torch/_inductor/codegen/cpp_prefix.h +++ b/torch/_inductor/codegen/cpp_prefix.h @@ -637,8 +637,8 @@ void atomic_add_vec(T *addr, at::vec::VectorizedN index, at::vec::V static_assert(len <= at::vec::VectorizedN::size()); __at_align__ std::array tmpbuf; __at_align__ std::array tmpidx; - offset.store(tmpbuf.data()); - index.store(tmpidx.data()); + offset.store(tmpbuf.data(), len); + index.store(tmpidx.data(), len); for (int i = 0; i < len; i++){ atomic_add(addr + tmpidx[i], tmpbuf[i]); } From 5b50b0a9bcd51b8513603ad91aabbe6d0272639b Mon Sep 17 00:00:00 2001 From: ernest-lu <53326307+ernest-lu@users.noreply.github.com> Date: Thu, 24 Oct 2024 20:29:19 +0000 Subject: [PATCH 034/161] remove dead code (#138690) Fixes issue-138673: [issue](https://github.com/pytorch/pytorch/issues/138673) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138690 Approved by: https://github.com/Aidyn-A, https://github.com/colesbury --- test/test_proxy_tensor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py index 496dfbccea35a..f31f85f12e219 100644 --- a/test/test_proxy_tensor.py +++ b/test/test_proxy_tensor.py @@ -1519,11 +1519,6 @@ def f(x1, x2, x3, y): z3 = x3.item() torch._check(z1 == z2 + z3) return y * 2 - if z2 + z3 == z1: - return y * 2 - else: - return y + 3 - # NB: inputs are done as CUDA to ensure they aren't queried to be # backed From 500b2bc78116bd24734d289fe3884d4b8e7bd380 Mon Sep 17 00:00:00 2001 From: Bob Ren Date: Tue, 22 Oct 2024 19:18:13 -0700 Subject: [PATCH 035/161] Have as_tensor always return a float64 tensor in dynamo (#138598) As discussed with @ezyang, this set of diffs are extracting fixes to problems discovered to flipping `specialize_float=False` in https://github.com/pytorch/pytorch/pull/137782. Since these codepaths are exercised in existing tests, I'm going to bias towards shipping speed and put these up with the primary test plan as the global CI. These code paths are all tested via existing tests when `specialize_float=False` and it feels a bit wonky to add more gated tests that only test behavior when this flag is True, especially since these code paths are already covered. That being said, I'm happy to add individual tests if reviewers insist or have a different POV. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138598 Approved by: https://github.com/ezyang ghstack dependencies: #138595 --- test/dynamo/test_unspec.py | 14 ++++++++++++++ torch/_dynamo/codegen.py | 4 +++- torch/_dynamo/variables/tensor.py | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index b4b4389dd3029..bd154f904fdf4 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -601,6 +601,20 @@ def fn(x): compl_fn = torch.compile(fn, dynamic=True, backend="eager") self.assertEqual(compl_fn(inputs), fn(inputs)) + @torch._dynamo.config.patch(specialize_float=False) + def test_unspec_roundtrip_float_input(self): + def f(x, y): + if y == 5.0: + return x + 2 + else: + return x + y + return (x, y) + + cf = torch.compile(backend="eager", fullgraph=True)(f) + x = 1.1234567891234568 + y = 1.1234567891234569 + self.assertAlmostEqual(f(x, y), cf(x, y)) + @torch._dynamo.config.patch(specialize_float=False, assume_static_by_default=True) def test_unspec_float_input(self): cnts = torch._dynamo.testing.CompileCounter() diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py index 74d761c50e92f..dd83f56c34615 100644 --- a/torch/_dynamo/codegen.py +++ b/torch/_dynamo/codegen.py @@ -185,7 +185,9 @@ def __call__(self, value, allow_cache=True): # NB: It works to add_graph_output on a computed expression # as_tensor here, because we memoize as_tensor calls on # SymNodeVariable! - graph_outputs_key = self.add_graph_output(value.as_tensor(self.tx)) + graph_outputs_key = self.add_graph_output( + value.as_tensor(self.tx, torch.float64) + ) def gen_fn(): self.load_graph_output(graph_outputs[graph_outputs_key].index) diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index c782a19d7f1e4..bb64f30a458ac 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -1146,11 +1146,11 @@ def python_type(self): def as_proxy(self): return self.proxy - def as_tensor(self, tx): + def as_tensor(self, tx, dtype): if self._tensor_var is None: self._tensor_var = VariableTracker.build( tx, torch.scalar_tensor - ).call_function(tx, [self], {}) + ).call_function(tx, [self], {"dtype": VariableTracker.build(tx, dtype)}) return self._tensor_var def evaluate_expr(self, output_graph=None): From e78a3e260b38d85bc2f64a26acfa47222b87fb70 Mon Sep 17 00:00:00 2001 From: Yiming Zhou Date: Thu, 24 Oct 2024 21:35:32 +0000 Subject: [PATCH 036/161] [export] Add serdes_non_strict to tests (#138662) Summary: We expand the tests to cover serdes_non_strict. Currently failing tests are skipped. 
Test Plan: ``` buck2 test @//mode/dev-nosan //caffe2/test:test_export -- -r _serdes_non_strict ``` Differential Revision: D64709285 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138662 Approved by: https://github.com/avikchaudhuri --- test/export/test_export.py | 16 ++++++++++++++- test/export/test_serdes.py | 42 ++++++++++++++++++++++++++++---------- test/export/testing.py | 6 ++++++ 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/test/export/test_export.py b/test/export/test_export.py index c3c6f1f231e36..f6cc5554666b3 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -168,6 +168,7 @@ class Inp: RETRACEABILITY_STRICT_SUFFIX = "_retraceability" RETRACEABILITY_NON_STRICT_SUFFIX = "_retraceability_non_strict" SERDES_SUFFIX = "_serdes" +SERDES_NON_STRICT_SUFFIX = "_serdes_non_strict" PREDISPATCH_SUFFIX = "_pre_dispatch" TRAINING_IR_DECOMP_STRICT_SUFFIX = "_training_ir_to_decomp" TRAINING_IR_DECOMP_NON_STRICT_SUFFIX = "_training_ir_to_decomp_non_strict" @@ -184,7 +185,9 @@ def is_retracebility_test(test_name): def is_serdes_test(test_name): - return test_name.endswith(SERDES_SUFFIX) + return test_name.endswith(SERDES_SUFFIX) or test_name.endswith( + SERDES_NON_STRICT_SUFFIX + ) def is_training_ir_test(test_name): @@ -1011,6 +1014,7 @@ def forward(self, x, y): self.assertEqual(ep.module()(x, y), model(x, y)) @testing.expectedFailureSerDer # SymBool serialization? TODO(pianpwk) + @testing.expectedFailureSerDerNonStrict def test_real_tensor_bool_cast(self): class Foo(torch.nn.Module): def forward(self, x): @@ -1022,6 +1026,7 @@ def forward(self, x): ep = export(model, inputs, strict=False) @testing.expectedFailureSerDer + @testing.expectedFailureSerDerNonStrict def test_is_nonzero(self): class Foo(torch.nn.Module): def forward(self, x): @@ -1175,6 +1180,7 @@ def forward(self, x, weight, bias): self.assertEqual(actual_result, expected_result) @testing.expectedFailureSerDer # failed serializing SymInt nodes in subgraph (known issue) + @testing.expectedFailureSerDerNonStrict def test_hoo_inline_users_issue(self): # This came from an issue where replace_with_hop passes would inline subgraphs, # and mess up node.users for nodes present in multiple subgraphs (e.g. _x in SetGradCase @@ -2620,6 +2626,7 @@ def forward(self, x): export(N(), inputs, dynamic_shapes=dynamic_shapes) @testing.expectedFailureSerDer # no unbacked bindings after deserialization? 
+ @testing.expectedFailureSerDerNonStrict def test_unbacked_bindings_for_divisible_u_symint(self): with torch.library._scoped_library("mylib", "FRAGMENT") as lib: torch.library.define( @@ -2820,6 +2827,7 @@ def forward(self, t): export(N(), (t,), strict=strict) @testing.expectedFailureSerDer # T195866111 + @testing.expectedFailureSerDerNonStrict def test_suggested_fixes_for_data_dependent_errors_puzzlers(self): # suggested fixes for data-dependent errors only work in non-strict mode strict = False @@ -3101,6 +3109,7 @@ def forward(self, x, y): ep.module()(torch.randn(6, 3), torch.randn(7, 4)) @testing.expectedFailureRetraceability # T183144629 + @testing.expectedFailureSerDerNonStrict def test_map(self): class Module(torch.nn.Module): def forward(self, xs, y, z): @@ -3330,6 +3339,7 @@ def forward(self, arg1, arg2, *args, kw1, kw2, **kwargs): self._test_export_same_as_eager(kw_func, args, kwargs) @testing.expectedFailureSerDer # we don't save placeholder metadata + @testing.expectedFailureSerDerNonStrict @testing.expectedFailureNonStrict @testing.expectedFailureTrainingIRToRunDecompNonStrict # source_fn_stack failure @testing.expectedFailureRetraceabilityNonStrict @@ -6155,6 +6165,7 @@ def forward(self, q, k, v): self.assertEqual(ep.module()(*inputs), m(*inputs)) @testing.expectedFailureSerDer # symfloat nyi + @testing.expectedFailureSerDerNonStrict def test_sym_sqrt(self): import math @@ -7740,6 +7751,7 @@ def forward(self, w, x, y, z): # TODO requires_grad doesn't seem to work with serialization. @testing.expectedFailureSerDer + @testing.expectedFailureSerDerNonStrict def test_preserve_requires_grad_placeholders(self): class Module(torch.nn.Module): def __init__(self) -> None: @@ -8632,6 +8644,7 @@ def test_dynamic_shapes_serdes_user_errors(self): _load_dynamic_shapes(spec, from_dict=True) @testing.expectedFailureSerDer # TODO(pianpwk): PowByNatural valuerange deserialization + @testing.expectedFailureSerDerNonStrict @testing.expectedFailureRetraceabilityNonStrict def test_dim_dynamic(self): dynamic = Dim.DYNAMIC @@ -8711,6 +8724,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: @testing.expectedFailureNonStrict @testing.expectedFailureTrainingIRToRunDecompNonStrict # unbacked symint not tracked? 
@testing.expectedFailureSerDer # T195866111 + @testing.expectedFailureSerDerNonStrict @testing.expectedFailureRetraceabilityNonStrict def test_hints_wrapper(self): class M(torch.nn.Module): diff --git a/test/export/test_serdes.py b/test/export/test_serdes.py index a1ced9dd4e5e6..d22d19500f3ae 100644 --- a/test/export/test_serdes.py +++ b/test/export/test_serdes.py @@ -15,7 +15,7 @@ test_classes = {} -def mocked_serder_export(*args, **kwargs): +def mocked_serder_export_strict(*args, **kwargs): ep = export(*args, **kwargs) buffer = io.BytesIO() save(ep, buffer) @@ -24,16 +24,35 @@ def mocked_serder_export(*args, **kwargs): return loaded_ep -def make_dynamic_cls(cls): - cls_prefix = "SerDesExport" +def mocked_serder_export_non_strict(*args, **kwargs): + if "strict" in kwargs: + ep = export(*args, **kwargs) + else: + ep = export(*args, **kwargs, strict=False) + buffer = io.BytesIO() + save(ep, buffer) + buffer.seek(0) + loaded_ep = load(buffer) + return loaded_ep + - test_class = testing.make_test_cls_with_mocked_export( - cls, - cls_prefix, - test_export.SERDES_SUFFIX, - mocked_serder_export, - xfail_prop="_expected_failure_serdes", - ) +def make_dynamic_cls(cls, strict): + if strict: + test_class = testing.make_test_cls_with_mocked_export( + cls, + "SerDesExport", + test_export.SERDES_SUFFIX, + mocked_serder_export_strict, + xfail_prop="_expected_failure_serdes", + ) + else: + test_class = testing.make_test_cls_with_mocked_export( + cls, + "SerDesExportNonStrict", + test_export.SERDES_NON_STRICT_SUFFIX, + mocked_serder_export_non_strict, + xfail_prop="_expected_failure_serdes_non_strict", + ) test_classes[test_class.__name__] = test_class # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING @@ -46,7 +65,8 @@ def make_dynamic_cls(cls): test_export.TestExport, ] for test in tests: - make_dynamic_cls(test) + make_dynamic_cls(test, True) + make_dynamic_cls(test, False) del test if __name__ == "__main__": diff --git a/test/export/testing.py b/test/export/testing.py index 6d5729fd6b77a..ed72f219eb639 100644 --- a/test/export/testing.py +++ b/test/export/testing.py @@ -270,6 +270,12 @@ def expectedFailureSerDer(fn): return fn +# Controls tests generated in test/export/test_serdes.py +def expectedFailureSerDerNonStrict(fn): + fn._expected_failure_serdes_non_strict = True + return fn + + def expectedFailureSerDerPreDispatch(fn): fn._expected_failure_serdes_pre_dispatch = True return fn From 282e6383c160c194ebd7fbe17512313f5b42be3f Mon Sep 17 00:00:00 2001 From: James Wu Date: Wed, 23 Oct 2024 09:52:14 -0700 Subject: [PATCH 037/161] Add inductor cache metrics (#138603) Each inductor event should have exactly one hit, miss, bypass etc. Add it to the inductor compile event. Add triton_compile as a compiler phase with `dynamo_timed`. This way, we get PT2 Compile Event Logs for triton as well. 
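The `dynamo_timed` half of the change is not part of the diff below (which only covers the cache metrics), but as a rough sketch, wrapping a step like this is what makes it show up as its own phase in the compile event logs (the wrapper and compile function here are hypothetical names, not the actual call sites):

```python
from torch._dynamo.utils import dynamo_timed

def compile_kernel(kernel_src):
    # Anything executed under dynamo_timed("triton_compile") is recorded as a
    # "triton_compile" phase in the PT2 compile event logs / chromium traces.
    with dynamo_timed("triton_compile"):
        return run_triton_compiler(kernel_src)  # placeholder for the real compile step
```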
Here's what triton events look like: {F1941513932} And this on a cache hit(since we still redo this work): {F1941514350} Inductor cache info: {F1941528530} Differential Revision: [D64703392](https://our.internmc.facebook.com/intern/diff/D64703392/) @diff-train-skip-merge Pull Request resolved: https://github.com/pytorch/pytorch/pull/138603 Approved by: https://github.com/oulgen --- torch/_inductor/codecache.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 738669eec93c7..c914c6a7338bd 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -1512,6 +1512,18 @@ def load( # type: ignore[no-untyped-def] cache_info["cache_event_time"], metadata=cache_info, ) + # Add event data about cache hits/miss + # TODO: add remote cache get/put timings here too + chromium_log.add_event_data( + "inductor_compile", + cache_state=cache_state, + cache_event_time=cache_info["cache_event_time"], + key=cache_info.get("key"), + components=cache_info.get("components"), + cache_bypass_reason=cache_info.get("cache_bypass_reason"), + remote_cache_enabled=remote, + local_cache_enabled=local, + ) torch._logging.trace_structured( "artifact", metadata_fn=lambda: { From 51f6b946ae553def9c7da880222fc9f17d44a302 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Thu, 24 Oct 2024 22:14:55 +0000 Subject: [PATCH 038/161] [torchbind] Add generic __deepcopy__ method (#137613) Summary: Added a generic `__deepcopy__` method which will use the torchbind object's existing `__getattr__` and `__setattr__` to copy the torchbind object. This will later be used in [D64124825](https://www.internalfb.com/diff/D64124825) Differential Revision: D64124826 Pull Request resolved: https://github.com/pytorch/pytorch/pull/137613 Approved by: https://github.com/ydwu4, https://github.com/zou3519 --- .../jit/test_custom_class_registrations.cpp | 50 ++++++++++++++----- test/export/test_torchbind.py | 26 +++++++++- torch/csrc/jit/python/script_init.cpp | 47 +++++++++++++++++ 3 files changed, 110 insertions(+), 13 deletions(-) diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp index c3448a46cdf0a..d1e0d5fa2180b 100644 --- a/test/cpp/jit/test_custom_class_registrations.cpp +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -145,17 +145,39 @@ struct TensorQueue : torch::CustomClassHolder { } } - c10::Dict serialize() const { - c10::Dict dict; - dict.insert(std::string("init_tensor"), init_tensor_); - const std::string key = "queue"; - dict.insert( - key + "/size", torch::tensor(static_cast(queue_.size()))); - for (const auto index : c10::irange(queue_.size())) { - dict.insert(key + "/" + std::to_string(index), queue_[index]); + std::tuple< + std::tuple, + std::tuple>> + serialize() { + return std::tuple( + std::tuple("init_tensor", this->init_tensor_.clone()), + std::tuple("queue", this->clone_queue())); + } + + static c10::intrusive_ptr deserialize( + std::tuple< + std::tuple, + std::tuple>> flattened) { + TORCH_CHECK(std::tuple_size::value == 2); + + auto init_tensor_tuple = std::get<0>(flattened); + TORCH_CHECK(std::tuple_size::value == 2); + TORCH_CHECK(std::get<0>(init_tensor_tuple) == std::string("init_tensor")); + + c10::intrusive_ptr queue = + c10::make_intrusive(std::get<1>(init_tensor_tuple)); + + auto queue_tuple = std::get<1>(flattened); + TORCH_CHECK(std::tuple_size::value == 2); + TORCH_CHECK(std::get<0>(queue_tuple) == std::string("queue")); + + for (auto& value : 
std::get<1>(queue_tuple)) { + queue->push(value); } - return dict; + + return queue; } + // Push the element to the rear of queue. // Lock is added for thread safe. void push(at::Tensor x) { @@ -639,13 +661,17 @@ TORCH_LIBRARY(_TorchScriptTesting, m) { .def_pickle( // __getstate__ [](const c10::intrusive_ptr& self) - -> c10::Dict { + -> std::tuple< + std::tuple, + std::tuple>> { return self->serialize(); }, // __setstate__ - [](c10::Dict data) + [](std::tuple< + std::tuple, + std::tuple>> data) -> c10::intrusive_ptr { - return c10::make_intrusive(std::move(data)); + return TensorQueue::deserialize(data); }); } diff --git a/test/export/test_torchbind.py b/test/export/test_torchbind.py index 9b2f1546f1a78..997aeecd37dda 100644 --- a/test/export/test_torchbind.py +++ b/test/export/test_torchbind.py @@ -1,6 +1,6 @@ # Owner(s): ["oncall: export"] - +import copy import unittest import torch @@ -1028,6 +1028,30 @@ def forward(self, token, tq, x): return (tq,)""", # noqa: B950 ) + def test_deepcopy(self): + tq = torch.classes._TorchScriptTesting._TensorQueue( + torch.empty( + 0, + ).fill_(-1) + ) + tq_0 = copy.deepcopy(tq) + tq.push(torch.zeros(2, 2)) + tq.push(torch.ones(2, 2)) + tq_1 = copy.deepcopy(tq) + tq.push(torch.ones(2, 2) * 2) + self.assertEqual(tq_0.size(), 0) + self.assertEqual(tq_1.size(), 2) + self.assertEqual(tq.size(), 3) + + foo = torch.classes._TorchScriptTesting._Foo(1, 2) + foo_0 = copy.deepcopy(foo) + foo.increment(1) + foo_1 = copy.deepcopy(foo) + foo.increment(1) + self.assertEqual(foo_0.add(1), 3) + self.assertEqual(foo_1.add(1), 5) + self.assertEqual(foo.add(1), 7) + class TestCompileTorchbind(TestCase): def setUp(self): diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 2eb9a6f021770..690859f0a0a2a 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -866,6 +866,53 @@ void initJitScriptBindings(PyObject* module) { // Similar to Tensor's `__hash__`, which is `id()`. 
return std::hash{}(self._ivalue().get()); }) + .def( + "__deepcopy__", + [](const Object& self, const py::dict& memo) { + if (auto getstate_method = self.find_method("__getstate__")) { + auto object_state = toPyObject((*getstate_method)(Stack{})); + + if (auto qualname = self.type()->name()) { + auto class_type = getCustomClass(qualname->qualifiedName()); + auto self = Object(c10::ivalue::Object::create( + c10::StrongTypePtr( + std::shared_ptr(), + class_type), + 1)); + + if (auto setstate_method = + self.find_method("__setstate__")) { + auto setstate_schema = + setstate_method->function().getSchema(); + TORCH_INTERNAL_ASSERT( + setstate_schema.arguments().size() == 2, + "__setstate__ method for class ", + class_type->repr_str(), + " must have exactly 2 arguments!"); + auto state_type = + setstate_schema.arguments().at(1).type(); + (*setstate_method)( + Stack{toIValue(object_state, state_type)}); + return self; + } + std::stringstream err; + err << "Tried to deepcopy object "; + if (auto qualname = class_type->name()) { + err << qualname->qualifiedName() << " "; + } + err << "which does not have a __setstate__ method defined!"; + throw std::runtime_error(err.str()); + } + } + + std::stringstream err; + err << "Tried to deepcopy object "; + if (auto qualname = self.type()->name()) { + err << qualname->qualifiedName() << " "; + } + err << "which does not have a __getstate__ method defined!"; + throw std::runtime_error(err.str()); + }) .def(py::pickle( [](const Object& self) -> std::tuple { // __getstate__ From 09848c892a8bc1920c27ecff0f5a8ef39e39b357 Mon Sep 17 00:00:00 2001 From: Pian Pawakapan Date: Thu, 24 Oct 2024 22:22:14 +0000 Subject: [PATCH 039/161] [aot_compile] propagate ShapeEnv during lowering (#138362) We found that `export() -> _inductor.aot_compile()` lowering, 3 different ShapeEnvs get created, leading to errors when one ShapeEnv processes expressions created by another ShapeEnv. This plumbs the 2 places where ShapeEnv creation happens, detecting the original ShapeEnv from the GraphModule example values, so the original ShapeEnv is just reused. 
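A condensed sketch of the reuse logic (the helper and its arguments below are illustrative; the actual changes are in `aot_autograd.py` and `compile_fx.py` in this diff):

```python
from torch._guards import detect_fake_mode

def _pick_shape_env(flat_args, make_fresh_fake_mode):
    # Prefer the fake mode (and its ShapeEnv) that export() already attached to
    # the example inputs / graph values; only build a fresh one as a fallback.
    fake_mode = detect_fake_mode(flat_args)
    if fake_mode is not None:
        return fake_mode, fake_mode.shape_env
    return make_fresh_fake_mode(flat_args)
```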
Differential Revision: D64613290 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138362 Approved by: https://github.com/angelayi --- .../aot_inductor_torchbench_inference.csv | 2 +- .../aot_inductor_torchbench_inference.csv | 2 +- test/inductor/test_aot_inductor.py | 20 +++++++++++++++++++ torch/_functorch/aot_autograd.py | 6 +++++- torch/_inductor/compile_fx.py | 5 +++-- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv index 25d69931bb6c1..1b9b034987947 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv @@ -278,7 +278,7 @@ resnext50_32x4d,pass,0 -sam,fail_to_run,0 +sam,pass,0 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cu124/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cu124/aot_inductor_torchbench_inference.csv index 1624d6dc7973f..1934304128888 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cu124/aot_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cu124/aot_inductor_torchbench_inference.csv @@ -282,7 +282,7 @@ resnext50_32x4d,pass,0 -sam,fail_to_run,0 +sam,pass,0 diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 487fa3d344cbe..14c48a4e7e94e 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -1,6 +1,7 @@ # Owner(s): ["module: inductor"] import copy import itertools +import logging import os import sys import tempfile @@ -41,6 +42,7 @@ skipIfRocm, TEST_WITH_ROCM, ) +from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test from torch.testing._internal.triton_utils import HAS_CUDA, requires_cuda from torch.utils import _pytree as pytree @@ -3695,6 +3697,24 @@ def forward(self, x): self.check_model(Model(), example_inputs) +class AOTInductorLoggingTest(LoggingTestCase): + @make_logging_test(dynamic=logging.DEBUG) + def test_shape_env_reuse(self, records): + # make sure ShapeEnv is only created once and reused afterwards + class Foo(torch.nn.Module): + def forward(self, x): + return x + 2 + + inputs = (torch.randn(4, 4),) + dynamic_shapes = { + "x": {0: Dim.AUTO, 1: Dim.AUTO}, + } + ep = export(Foo(), inputs, dynamic_shapes=dynamic_shapes, strict=False) + with torch.no_grad(): + torch._inductor.aot_compile(ep.module(), inputs) + self.assertEqual([r.msg == "create_env" for r in records].count(True), 1) + + common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate) diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index 87c49887dea15..a284689c0e72c 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -1454,6 +1454,7 @@ def _aot_export_function( flat_fn, out_spec = create_tree_flattened_fn(func, args, kwargs) flat_args, in_spec = pytree.tree_flatten((args, kwargs)) + fake_mode = None if dynamic_shapes is None: # Try to infer `dynamic_shapes from inputs and graph nodes fake_mode = detect_fake_mode(flat_args) @@ -1491,7 +1492,10 @@ def _aot_export_function( no_tangents=no_tangents, pre_dispatch=pre_dispatch, ) - fake_mode, shape_env = construct_fake_mode(flat_args, aot_config) + if fake_mode is None: + fake_mode, shape_env = construct_fake_mode(flat_args, aot_config) + else: + shape_env = fake_mode.shape_env fake_flat_args = 
process_inputs(flat_args, aot_config, fake_mode, shape_env) fx_g, meta = create_aot_dispatcher_function( diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index 1320cbc30a054..2a167231b5e5d 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -1391,6 +1391,7 @@ def compile_fx( } ), V.set_real_inputs(example_inputs_): inputs_: Sequence[InputType] = example_inputs_ + if isinstance(model_, GraphModule): fake_inputs = [ node.meta.get("val") @@ -1404,7 +1405,7 @@ def compile_fx( for inp in fake_inputs ] - if all(v is not None for v in fake_inputs): + if any(v is not None for v in fake_inputs): # Validate devices before switching to fake tensors. for idx, fi, i in zip(count(), fake_inputs, inputs_): if fi is not None: @@ -1415,7 +1416,7 @@ def compile_fx( f"{fi.device} vs {i.device}. If the model was exported via torch.export(), " "make sure torch.export() and torch.aot_compile() run on the same device." ) - inputs_ = fake_inputs + inputs_ = fake_inputs # type: ignore[assignment] return compile_fx( model_, inputs_, From 3e4ba18eb5d3d271cffb662467e2b637a998b3a2 Mon Sep 17 00:00:00 2001 From: Henry Tsang Date: Thu, 24 Oct 2024 23:16:30 +0000 Subject: [PATCH 040/161] [aoti] fix typo in codegen_dynamic_scalar (#138760) Summary: appears to be a typo Test Plan: ci Differential Revision: D64867271 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138760 Approved by: https://github.com/ezyang --- torch/_inductor/codegen/cpp_wrapper_cpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index 3f395d8ec7221..891ed89ed8d66 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -1211,9 +1211,9 @@ def codegen_dynamic_scalar(self, node): if len(node.keypath) == 0: self.writeline(f"auto {node.sym} = {node.sym}_raw;") - elif len(node.keypath == 1) and isinstance(node.keypath[0], ConvertIntKey): + elif len(node.keypath) == 1 and isinstance(node.keypath[0], ConvertIntKey): self.writeline(f"int64_t {node.sym} = {node.sym}_raw ? 1 : 0;") - elif len(node.keypath == 1) and isinstance(node.keypath[0], DivideByKey): + elif len(node.keypath) == 1 and isinstance(node.keypath[0], DivideByKey): # TODO: assert divisibility here self.writeline( f"int64_t {node.sym} = {node.sym}_raw / {node.keypath[0].divisor};" From 751987eed1490de3e14308760f38ab664721e8ea Mon Sep 17 00:00:00 2001 From: Henry Tsang Date: Thu, 24 Oct 2024 23:38:07 +0000 Subject: [PATCH 041/161] [pt2] improve error logs for torch.cond and aoti package (#138647) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138647 Approved by: https://github.com/ydwu4, https://github.com/angelayi --- torch/_higher_order_ops/cond.py | 7 +++++-- torch/_inductor/__init__.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 08d9dda49dd39..d9c4d0ed8b109 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -468,14 +468,17 @@ def cond_func(ctx, pred, true_fn, false_fn, inputs): branch, unwrapped_inputs, pre_dispatch=pre_dispatch ): raise UnsupportedAliasMutationException( - "One of torch.cond branch might be modifying the input!" + "One of torch.cond branch might be modifying the input! " + "Consider cloning the input before modifying it. 
" ) for branch in [true_fn, false_fn]: if _has_potential_branch_input_alias( branch, unwrapped_inputs, pre_dispatch=pre_dispatch ): raise UnsupportedAliasMutationException( - "One of torch.cond branch might be aliasing the input!" + "One of torch.cond branch might be aliasing the input! " + "If you are returning a view of the input, please make sure " + "to clone it. " ) cond_return = cond_op( diff --git a/torch/_inductor/__init__.py b/torch/_inductor/__init__.py index 422ffc94990da..397739147c13b 100644 --- a/torch/_inductor/__init__.py +++ b/torch/_inductor/__init__.py @@ -79,7 +79,9 @@ def aoti_compile_and_package( if not isinstance(exported_program, ExportedProgram): raise ValueError("Only ExportedProgram is supported") - assert package_path is None or package_path.endswith(".pt2") + assert package_path is None or package_path.endswith( + ".pt2" + ), f"Expect package path to end with .pt2, got {package_path}" inductor_configs = inductor_configs or {} From 425ce2a7eef412f5205bbd24afa84dcfa191ec1e Mon Sep 17 00:00:00 2001 From: Chirag Pandya Date: Thu, 24 Oct 2024 23:42:29 +0000 Subject: [PATCH 042/161] [c10d] use a promise to delay watchdog shutdown (#138828) Summary: We always need to give the heartbeat monitor thread time to write out flight recorder dumps. Otherwise, the watchdog thread kills the heartbeat monitor thread too fast before it has time to write out the Flight Recorder logs. This change: 1. Removes the "sleep after exception" JK. We don't need to sleep for 8 minutes. 2. Use a promise between watchdog thread and heartbeat monitor thread to delay, at most, one minute to give Flight Recorder time to write out it's log on timeout. Test Plan: Tested on my local job and flight recorder successfully executed for the job. https://fburl.com/mlhub/38fj5yne The watchdog thread gives heartbeat thread time to write out the logs. In the logs we see: ``` [trainer4]:I1023 17:39:29.755507 12592 ProcessGroupNCCL.cpp:1950] [PG ID 0 PG GUID 0(precheck) Rank 12] slept for 1647ms giving time for flight recorder dumps to finish. ``` Differential Revision: D64857928 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138828 Approved by: https://github.com/d4l3k, https://github.com/fduwjj --- .../distributed/c10d/ProcessGroupNCCL.cpp | 21 ++++++++++++------- .../distributed/c10d/ProcessGroupNCCL.hpp | 10 +++------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index fbd69dd7fd97f..78d050442ade5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -900,7 +900,6 @@ ProcessGroupNCCL::ProcessGroupNCCL( // both timeout and other errors. dumpOnTimeoutOrEx_ = getCvarBool(TORCH_NCCL_DUMP_ON_TIMEOUT, false) || (dist_debug_level_ >= DebugLevel::Detail); - sleepAfterException_ = getCvarBool(TORCH_NCCL_SLEEP_AFTER_EXCEPTION, false); // logging C++ stack isn't safe. Introduce a variable to control it. logCppStackOnUncleanShutdown_ = getCvarBool(TORCH_NCCL_LOG_CPP_STACK_ON_UNCLEAN_SHUTDOWN, true); @@ -1592,6 +1591,8 @@ void ProcessGroupNCCL::heartbeatMonitor() { "Flight recorder dump in heartbeatMonitor", false, true); + // Indicate to watchdog thread that we have finished dumping. + promiseFlightRecorderDump_.set_value(); } // GIL deadlock check. 
@@ -1947,12 +1948,18 @@ void ProcessGroupNCCL::watchdogHandler() { } // signal the monitor thread on PG0 to start dumping shouldDump_.store(true); - if (sleepAfterException_) { - // This sleep is used to give time for dumping before throwing - // exception - std::this_thread::sleep_for( - std::chrono::seconds(heartbeatTimeoutInSec_)); - LOG(INFO) << logPrefix() << "slept for " << heartbeatTimeoutInSec_ + // Give time for dumping before throwing exception + auto start = std::chrono::steady_clock::now(); + auto status = promiseFlightRecorderDump_.get_future().wait_for( + std::chrono::milliseconds(waitTimeoutDumpInMilSec_)); + if (status == std::future_status::timeout) { + LOG(WARNING) << logPrefix() << "timed out after waiting for " + << waitTimeoutDumpInMilSec_ << "ms" + << " flight recorder dumps to finish."; + } else if (status == std::future_status::ready) { + auto end = std::chrono::steady_clock::now(); + LOG(INFO) << logPrefix() << "slept for " + << computeDeltaMS(start, end) << "ms" << " giving time for flight recorder dumps to finish."; } } catch (const std::exception& e) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 9a4dd3f46a68d..263ec512c8c81 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -64,13 +64,6 @@ static std::vector TORCH_NCCL_ASYNC_ERROR_HANDLING = { static std::vector TORCH_NCCL_DUMP_ON_TIMEOUT = { "TORCH_NCCL_DUMP_ON_TIMEOUT"}; -// TODO: remove this change after a safe rollout. -// Control whether we sleep after an exception is thrown. -// This change is temporary and is used to safely remove the current sleep that -// exists after an exception is thrown. -static std::vector TORCH_NCCL_SLEEP_AFTER_EXCEPTION = { - "TORCH_NCCL_SLEEP_AFTER_EXCEPTION"}; - // Control whether Desync Debug is enabled. This variable must be set // together with TORCH_NCCL_ASYNC_ERROR_HANDLING. static std::vector TORCH_NCCL_DESYNC_DEBUG = { @@ -1041,6 +1034,9 @@ class TORCH_API ProcessGroupNCCL : public Backend { // timeout for the dump to finish. int waitTimeoutDumpInMilSec_; + // promise to coordinate flight recorder dump. + std::promise promiseFlightRecorderDump_; + // Interval of check coordinated signals in ProcessGroupNCCL from other ranks // e.g., trigger the dump of the debugging info for timeout when notified. 
int coordCheckIntervalMilSec_; From 277b32c93093ad35bdfb5543dbdff30273c81c9e Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Thu, 24 Oct 2024 23:42:52 +0000 Subject: [PATCH 043/161] fix unflatten training ir test suffix (#138840) Test Plan: none Differential Revision: D64917214 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138840 Approved by: https://github.com/zhxchen17 --- test/export/test_unflatten_training_ir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/export/test_unflatten_training_ir.py b/test/export/test_unflatten_training_ir.py index b5fb8466ed6d9..684d9a149ecfa 100644 --- a/test/export/test_unflatten_training_ir.py +++ b/test/export/test_unflatten_training_ir.py @@ -23,7 +23,7 @@ def make_dynamic_cls(cls): test_class = testing.make_test_cls_with_mocked_export( cls, cls_prefix, - "training_ir", + "_training_ir", mocked_training_ir_export, xfail_prop="_expected_failure_training_ir", ) From 4c91481656c7247affbc8e564c4d75b3fbbd5a79 Mon Sep 17 00:00:00 2001 From: Shuqiang Zhang Date: Thu, 24 Oct 2024 13:49:28 -0700 Subject: [PATCH 044/161] [c10d] allow sub group to be eagerly inited even if default one is not (#138665) Summary: Currently, eager mode is applied either to all PGs or NONE of them. There are cases where we don't want to initialize the comms for default PG, but we still want to initialize the comms for sub PG. Now with a device_id passed to new group, we can achieve this case Test Plan: newly added UT Tags: Resolves https://github.com/pytorch/pytorch/issues/137018 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138665 Approved by: https://github.com/kwen2501 ghstack dependencies: #138781 --- test/distributed/test_c10d_nccl.py | 24 +++++++++++++++++++ torch/_C/_distributed_c10d.pyi | 1 + torch/csrc/distributed/c10d/NCCLUtils.hpp | 5 ++++ .../distributed/c10d/ProcessGroupNCCL.cpp | 15 ++++++++++++ .../distributed/c10d/ProcessGroupNCCL.hpp | 3 +++ torch/csrc/distributed/c10d/init.cpp | 4 ++++ torch/distributed/distributed_c10d.py | 13 +++++++++- 7 files changed, 64 insertions(+), 1 deletion(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 2dcb0e1d2f066..027faceb43dd2 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -852,6 +852,30 @@ def test_comm_split_subgroup(self): self.assertEqual(tensor, original_tensor) dist.destroy_process_group() + @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + def test_comm_eager_init_subgroup(self): + # Test `ncclCommSplit` for smaller subgroups of the world when + # we've passed a specific device_id to init_process_group. 
+ store = c10d.FileStore(self.file_name, self.world_size) + device = torch.device(f"cuda:{self.rank}") + # default PG comm is not initialized yet + pg = self._create_process_group_nccl(store, self.opts()) + backend = pg._get_backend(torch.device(device)) + self.assertEqual(backend._is_initialized(), False) + + tensor = torch.full((1,), self.rank).cuda(device) + new_group = c10d.new_group([0, 1], device_id=device) + self.assertEqual(backend.comm_split_count(), 0) + + new_backend = new_group._get_backend(torch.device(device)) + self.assertEqual(new_backend._is_initialized(), True) + dist.broadcast(tensor, 0, group=new_group) + self.assertEqual(new_backend.comm_split_count(), 0) + self.assertEqual(backend._is_initialized(), False) + torch.cuda.synchronize() + dist.destroy_process_group() + @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") def test_comm_split_group(self): diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index edd51c8987538..acf6b8cc4f3b5 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -578,6 +578,7 @@ class ProcessGroupNCCL(Backend): def comm_split_count(self) -> int: ... def _add_ephemeral_timeout(self, timeout: timedelta) -> None: ... def abort(self) -> None: ... + def _is_initialized(self) -> bool: ... @property def uid(self) -> int: ... @property diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 89687a794dc67..5f01109b139d9 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -445,6 +445,11 @@ class NCCLComm { #endif } + bool isInitialized() const { + std::unique_lock lock(mutex_); + return initialized_; + } + bool isAborted() const { std::unique_lock lock(mutex_); return aborted_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 78d050442ade5..f8f83508cb6b6 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1076,6 +1076,21 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) { #endif } +bool ProcessGroupNCCL::isInitialized() { + if (devNCCLCommMap_.empty()) { + return false; + } + std::lock_guard lock(mutex_); + bool initialized = true; + for (const auto& [_, comm] : devNCCLCommMap_) { + if (!comm->isInitialized()) { + initialized = false; + break; + } + } + return initialized; +} + c10::intrusive_ptr ProcessGroupNCCL:: initIntraNodeComm() { using IntraNodeComm = intra_node_comm::IntraNodeComm; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 263ec512c8c81..66c25d53c35ad 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -716,6 +716,9 @@ class TORCH_API ProcessGroupNCCL : public Backend { void performNocolorSplit(at::Device device); + // If all comms on this PG are fully initialized, return true. + bool isInitialized(); + // This method adds a temporary extension for the timeout period, // applying to all collectives between the calling of this API and // the completion of the first collective on the GPU. 
While this feature diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 67cf3b581b1fb..0b28f7c183837 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2772,6 +2772,10 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). .def( "abort", &::c10d::ProcessGroupNCCL::abort, + py::call_guard()) + .def( + "_is_initialized", + &::c10d::ProcessGroupNCCL::isInitialized, py::call_guard()); module.def( diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 9942d6ddec485..44ddef2375276 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -4722,6 +4722,7 @@ def new_group( pg_options=None, use_local_synchronization=False, group_desc=None, + device_id: Optional[torch.device] = None, ): """ Create a new distributed group. @@ -4774,6 +4775,9 @@ def new_group( in that non-member ranks don't need to call into API and don't join the barrier. group_desc (str, optional): a string to describe the process group. + device_id (torch.device, optional): a single, specific device + to "bind" this process to, The `new_group` call will try to initialize + a communication backend immediately for the device if this field is given. Returns: A handle of distributed group that can be given to collective calls or @@ -4797,6 +4801,7 @@ def new_group( None, use_local_synchronization=use_local_synchronization, group_desc=group_desc, + device_id=device_id, ) @@ -4808,6 +4813,7 @@ def _new_group_with_tag( pg_tag=None, use_local_synchronization=False, group_desc=None, + device_id: Optional[torch.device] = None, ): """ Variant of ``new_group`` that exposes tag creation. @@ -4818,7 +4824,12 @@ def _new_group_with_tag( global _world default_pg = _get_default_group() - device_id = default_pg.bound_device_id + if device_id is None: + device_id = default_pg.bound_device_id + elif default_pg.bound_device_id is not None: + assert ( + device_id == default_pg.bound_device_id + ), "Mismatched bound device between new pg and the default pg." default_backend, default_store = _world.pg_map[default_pg] global_rank = default_pg.rank() global_world_size = default_pg.size() From 1d98a526dda9b0a5fe7ef38380394a4b1ab26b19 Mon Sep 17 00:00:00 2001 From: Avik Chaudhuri Date: Fri, 25 Oct 2024 00:13:25 +0000 Subject: [PATCH 045/161] preserve signatures with multiple calls + buffer mutations (#138669) As called out in https://github.com/pytorch/pytorch/pull/137999, preserving signatures of multiple calls when buffer mutations are present was NYI. The main problem was that intermediate values of buffers were not tracked, so couldn't be propagated statefully between multiple calls (i.e., they would need to be explicitly passed around, defeating the unlifting needed for preserving signatures). This PR fixes this situation, by introducing module attributes that carry the necessary intermediate values of buffer mutations. In general, a buffer mutation can have several intermediate values it depends on recursively, even other buffers. So rather than tying an intermediate value with a particular buffer, we tie it with the submodules that create and read it. We install an attribute on all modules that create or read a particular intermediate value, sharing the same initial storage (i.e., initialized with the same empty tensor). 
For the module that creates this intermediate value, we copy the value into the corresponding attribute; and for the modules that read it, we read the corresponding attribute instead. Another complication that needed to be addressed was that a `run_decompositions` following an `export_for_training` was not preserving module call graphs, which is needed for unflattening and, in particular, used when remapping inputs. Fortunately some existing metadata already tracks provenance of nodes, which we could use to update a module call graph after functionalization / decomposition. Differential Revision: D64806175 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138669 Approved by: https://github.com/tugsbayasgalan --- test/export/test_export.py | 41 ++++++---- torch/export/exported_program.py | 35 +++++++- torch/export/unflatten.py | 133 +++++++++++++++++++++++++++---- 3 files changed, 178 insertions(+), 31 deletions(-) diff --git a/test/export/test_export.py b/test/export/test_export.py index f6cc5554666b3..8b5e67ef8fe83 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -6586,25 +6586,34 @@ def forward(self, x): m = M() eager_result = m(*inp) - if not is_retracebility_test(self._testMethodName): - with self.assertRaisesRegex( - ValueError, - r"Found multiple calls of module n that mutate buffer n.buf", - ): - # Unflattening while preserving signatures is NYI for this case. - torch.export.unflatten( - export(M(), inp, preserve_module_call_signature=("n",)) - ) + def test(ep): + epm = ep.module() + ufm = torch.export.unflatten(ep) - ep = export(M(), inp) - epm = ep.module() - ufm = torch.export.unflatten(ep) + exported_result = epm(*inp) + self.assertTrue(torch.allclose(exported_result, eager_result)) - exported_result = epm(*inp) - self.assertTrue(torch.allclose(exported_result, eager_result)) + unflattened_result = ufm(*inp) + self.assertTrue(torch.allclose(unflattened_result, eager_result)) - unflattened_result = ufm(*inp) - self.assertTrue(torch.allclose(unflattened_result, eager_result)) + if not is_retracebility_test(self._testMethodName): + test(export(M(), inp, preserve_module_call_signature=("n",))) + # running decompositions again should work for all IRs + ep = export(M(), inp, preserve_module_call_signature=("n",)) + test(ep.run_decompositions({})) + if is_training_ir_test(self._testMethodName): + # since we run decompositions by default when testing training IR, + # also test training IR without running decompositions + strict = not is_non_strict_test(self._testMethodName) + ept = torch.export.export_for_training( + M(), + inp, + strict=strict, + preserve_module_call_signature=("n",), + ) + test(ept) + + test(export(M(), inp)) def test_unflatten_multiple_graphs_shared_submodule(self): class N(torch.nn.Module): diff --git a/torch/export/exported_program.py b/torch/export/exported_program.py index c73eafdbcb7f5..6bddbf65163ce 100644 --- a/torch/export/exported_program.py +++ b/torch/export/exported_program.py @@ -643,6 +643,30 @@ def _common_getitem_elimination_pass( node_id[node] = node.name +def _get_updated_module_call_graph( + gm: torch.fx.GraphModule, + old_module_call_graph: List[ModuleCallEntry], +): + new_module_call_graph = copy.deepcopy(old_module_call_graph) + + # use node-level provenance metadata to create a map + # from old node names to new node names + provenance: Dict[str, str] = {} + for node in gm.graph.nodes: + if history := node.meta.get("from_node", []): + provenance[history[-1][0]] = node.name + + # map old names to new names 
in module call signatures + for entry in new_module_call_graph: + signature = entry.signature + if signature is None: + continue + for x in [*signature.inputs, *signature.outputs]: + x.name = provenance.get(x.name, x.name) + + return new_module_call_graph + + def _decompose_exported_program( ep, *, @@ -657,6 +681,15 @@ def _decompose_exported_program( joint_loss_index=joint_loss_index, ) + # The signatures of ep.module_call_graph refer to input / output nodes of + # the original graph module. However, the new graph module may have + # new nodes due to decompositions. So we need to update these signatures + # in the decomposed exported program's module_call_graph. + new_module_call_graph = _get_updated_module_call_graph( + gm, + ep.module_call_graph, + ) + # TODO unfortunately preserving graph-level metadata is not # working well with aot_export. So we manually copy it. # (The node-level meta is addressed above.) @@ -673,7 +706,7 @@ def _decompose_exported_program( graph_signature=new_graph_signature, state_dict=ep.state_dict, range_constraints=new_range_constraints, - module_call_graph=copy.deepcopy(ep.module_call_graph), + module_call_graph=new_module_call_graph, example_inputs=ep.example_inputs, constants=ep.constants, ) diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 1b34fd805e08c..76e804716b48a 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -19,6 +19,7 @@ from torch.export.exported_program import ( ConstantArgument, ExportedProgram, + ExportGraphSignature, InputKind, ModuleCallSignature, SymIntArgument, @@ -219,19 +220,6 @@ def __init__( if export_module.graph_signature.backward_signature is not None: raise ValueError("Unflattening on JointExportModule NYI") - preserved_module_targets_with_multiple_calls = [ - entry.fqn.split("@")[0] - for entry in export_module.module_call_graph - if "@" in entry.fqn - ] - for buf in export_module.graph_signature.buffers_to_mutate.values(): - for fqn in preserved_module_targets_with_multiple_calls: - if buf.startswith(fqn + "."): - raise ValueError( - f"Found multiple calls of module {fqn} that mutate buffer {buf}. " - "Unflattening while preserving signatures is NYI for this case." - ) - fqn_list = [entry.fqn for entry in export_module.module_call_graph] assert fqn_list[0] == "" export_graph = deepcopy(export_module.graph) @@ -245,7 +233,15 @@ def __init__( self._run_with_interpeter = RUN_WITH_INTERPRETER _inplace_buffer_mutations(export_graph, self.graph_signature) + + self.ivals = _IVals() + # record any intermediate value x that is used, with the modules that used it, + # and generate instructions to read the corresponding attribute seen_modules = _outline_submodules(export_graph, self) + # for each read intermediate value x, find the module that created it, + # and generate instructions to update the corresponding attribute; + # finally, initialize all these attributes + self.ivals.create(seen_modules.values()) self.range_constraints = export_module.range_constraints self.equality_constraints: List = [] @@ -584,7 +580,10 @@ def unflatten( return UnflattenedModule(module, flat_args_adapter) -def _inplace_buffer_mutations(graph: torch.fx.Graph, graph_signature) -> None: +def _inplace_buffer_mutations( + graph: torch.fx.Graph, + graph_signature: ExportGraphSignature, +) -> None: """Transform buffer mutations from their functionalized form into a copy_ node in the graph. 
@@ -784,8 +783,10 @@ def __init__( if module is not None: self.module = module + self.ivals = module.ivals if hasattr(module, "ivals") else {} else: self.module = InterpreterModule(torch.fx.Graph()) + self.ivals = parent.ivals self.graph = self.module.graph @@ -948,6 +949,10 @@ def remap_input(self, x): # if module call signature needs to be preserved self.copy_sym_call_function(x) return self.node_map[x] + elif self.module_call_graph.get(self.fqn) is not None: + # x is an ival that is not in placeholders, so create a + # get_attr node corresponding to attribute __ival__x + return self.ivals.read(self.fqn, self.graph, x) else: raise RuntimeError( f"Could not run remap_input() on op type: {x.op} for node {x}" @@ -1198,6 +1203,106 @@ def _reorder_submodules( parent.register_module(name, child) +class _IVals: + """ + Collect the intermediate values of buffer mutations in a graph, + along with the module call fqns that create and use them. Later, + in each fqn associated with an intermediate value we will install + a corresponding attribute, so that it can be updated and read. + + Example: in the following graph, suppose that buf_in and buf_out + are the input and output values of a buffer. + + buf_in = placeholder() + ... + ival1 = f0(buf_in, ...) # inside self.n0(...) + ... + ival2 = f1(ival1, ...) # inside self.n1(...) + ... + buf_out = f2(ival2, ...) # inside self.n2(...) + return buf_out, ... + + Here ival1 and ival2 are intermediate values created inside + calls to n0 and n1 respectively, and used inside calls to + n1 and n2 respectively. + + Thus our analysis will produce {ival1: {n0, n1}, ival2: {n1, n2}}. + """ + + def __init__(self): + # ival node name -> set of fqns that create and use it + self.fqns = defaultdict(set) + # ival node name -> tensor storage for corresponding attribute + self.storage = {} + + def read(self, fqn, graph, node): + """ + Read attribute corresponding to a given intermediate value. + """ + # to read ival x, get attribute __ival__x + with graph.inserting_before(None): + ival_node = graph.get_attr("__ival__" + node.name, type_expr=node.type) + ival_node.meta = copy.copy(node.meta) + + if node.name not in self.storage: + # create empty tensor matching fake, using a cache + # to ensure the same tensor is returned per ival_name + fake = node.meta["val"] + self.storage[node.name] = torch.empty(fake.shape, dtype=fake.dtype) + self.fqns[node.name].add(fqn) + + return ival_node + + def update(self, fqn, graph, node): + """ + Update attribute corresponding to a given intermediate value. + """ + self.fqns[node.name].add(fqn) + + # to update ival x, get attribute __ival__x and copy x to __ival__x + with graph.inserting_after(node): + ival_node = graph.get_attr("__ival__" + node.name, type_expr=node.type) + ival_node.meta = copy.copy(node.meta) + with graph.inserting_after(ival_node): + new_ival_node = graph.create_node( + "call_function", torch.ops.aten.copy_, (ival_node, node) + ) + new_ival_node.meta = copy.copy(node.meta) + + def create(self, partitions): + """ + Update attributes corresponding to intermediate values that were read. + Finally, initialize attributes in all modules that read or update + corresponding intermediate values. 
+ """ + + entries = [] + for shared_submodules in partitions: + for entry in shared_submodules: + entries.append(entry) + graph = entry.module.graph + for node in graph.nodes: + if node.name in self.storage: + self.update(entry.fqn, graph, node) + + # fqn -> list of ival node names read or updated through it + ivals = defaultdict(list) + for name, fqns in self.fqns.items(): + for fqn in fqns: + ivals[fqn].append(name) + + for entry in entries: + for name in ivals[entry.fqn]: + ival_name = f"__ival__{name}" + # for a ival named x created in module call m, + # create attribute m.__ival__x, initially empty + setattr( + entry.module, + ival_name, + self.storage[name], + ) + + def _deduplicate_modules(partitions): for shared_submodules in partitions: for i, entry in enumerate(shared_submodules): From 2f4af0f4e683bc2e76301f6738eca57fa540af98 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Fri, 25 Oct 2024 00:25:49 +0000 Subject: [PATCH 046/161] [Profiler] Disable Dynamo-Sensitive Profiler Tests (#138762) Summary: During compilation, a profiler context gets ignored so we should temporarily turn off tests that are failing due to dynamo. Once profiler integration with dynamo is introduced we can reintroduce these tests Test Plan: Make sure CI is passing again Differential Revision: D64867447 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138762 Approved by: https://github.com/davidberard98 --- test/profiler/test_execution_trace.py | 1 + test/profiler/test_profiler.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/profiler/test_execution_trace.py b/test/profiler/test_execution_trace.py index 855db184dc2ee..da52d17845c63 100644 --- a/test/profiler/test_execution_trace.py +++ b/test/profiler/test_execution_trace.py @@ -121,6 +121,7 @@ def get_kineto_rf_ids(self, events: List[Json]) -> List[int]: @unittest.skipIf(not kineto_available(), "Kineto is required") @skipIfHpu + @skipIfTorchDynamo("profiler gets ignored if dynamo activated") def test_execution_trace_with_kineto(self, device): trace_called_num = 0 diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py index cd05a18ef84e0..ba9cbd79bb817 100644 --- a/test/profiler/test_profiler.py +++ b/test/profiler/test_profiler.py @@ -337,6 +337,7 @@ def extract(pattern: str): ) @serialTest() @parametrize("work_in_main_thread", [True, False]) + @skipIfTorchDynamo("profiler gets ignored if dynamo activated") def test_source_multithreaded(self, name, thread_spec, work_in_main_thread): """Test various threading configurations. 
@@ -1452,6 +1453,7 @@ def test_nested_tensor_with_shapes(self): @patch.dict(os.environ, {"KINETO_USE_DAEMON": "1"}) @patch.dict(os.environ, {"KINETO_DAEMON_INIT_DELAY_S": "1"}) + @skipIfTorchDynamo("profiler gets ignored if dynamo activated") def test_kineto_profiler_with_environment_variable(self): script = """ import torch From ed9169df98f2c1483ec1cfa69540fcb4eef1b9a1 Mon Sep 17 00:00:00 2001 From: Yifu Wang Date: Wed, 23 Oct 2024 16:00:54 -0700 Subject: [PATCH 047/161] Removed the typing information for already deleted ProcessGroupCudaP2P (#138753) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138753 Approved by: https://github.com/weifengpy --- torch/_C/_distributed_c10d.pyi | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index acf6b8cc4f3b5..fea0f54f53848 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -2,7 +2,7 @@ # mypy: disable-error-code="type-arg" from datetime import timedelta from enum import Enum -from typing import Any, Optional, overload +from typing import Any, overload import torch from torch import Tensor @@ -663,30 +663,3 @@ class _SymmetricMemory: def barrier(self, channel: int = 0) -> None: ... def put_signal(self, dst_rank: int, channel: int = 0) -> None: ... def wait_signal(self, src_rank: int, channel: int = 0) -> None: ... - -class ProcessGroupCudaP2P(Backend): - class Options: - nccl_options: Optional[ProcessGroupNCCL.Options] - buffer_size: Optional[int] - - def __init__(self) -> None: ... - - def __init__( - self, - store: Store, - rank: int, - size: int, - options: ProcessGroupCudaP2P.Options, - ) -> None: ... - def is_p2p_available(self) -> bool: ... - def get_buffer_size(self) -> int: ... - def stream(self) -> torch.cuda.Stream: ... - def intra_node_barrier(self) -> Work: ... - def get_p2p_buffer( - self, - rank: int, - sizes: torch.Size, - dtype: torch.dtype, - storage_offset: Optional[int] = 0, - ) -> torch.Tensor: ... - def _shutdown(self) -> None: ... From f737e3fe2f3d089a5b92beb9a397aaad2d8c7d2a Mon Sep 17 00:00:00 2001 From: Adnan Akhundov Date: Wed, 23 Oct 2024 15:44:55 -0700 Subject: [PATCH 048/161] [inductor] Fix ReinterpretView call in TMADescriptor IR (#138759) As a result of #137768, `ReinterpretView` call in the `TMADescriptor` has become invalid. This leads to some TMA tests breaking in test_triton_kernels.py. In this PR, we fix this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138759 Approved by: https://github.com/Chillee, https://github.com/eellison --- torch/_inductor/ir.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 6000b9a32b2c0..53d06748ff2d8 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -5352,7 +5352,12 @@ def __init__( # link back to the underlying tensor in terms of ownership # to avoid getting the underlying tensor deleted *before* # the TMADescriptor node can be deleted. 
- NonOwningLayout(ReinterpretView(tensor, tensor.get_layout())), + NonOwningLayout( + ReinterpretView( + data=tensor, + layout=tensor.get_layout(), + ) + ), inputs, tuple(constant_args), None, From 9425c0767d358e1db51176e8a66d6b63ac6b01be Mon Sep 17 00:00:00 2001 From: chilli Date: Wed, 23 Oct 2024 22:54:39 -0700 Subject: [PATCH 049/161] Fix free symbol handling in FlexAttention (#138794) Fixes https://github.com/pytorch/pytorch/issues/136196 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138794 Approved by: https://github.com/Skylion007 ghstack dependencies: #138733 --- .../ATen/functorch/BatchRulesScatterOps.cpp | 22 ++++----- test/inductor/test_flex_attention.py | 45 +++++++++++++++++++ torch/_inductor/select_algorithm.py | 8 +++- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 2fec0c3158da3..8f2738552310d 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -235,7 +235,7 @@ std::tuple> index_batch_rule( bool advanced_indices_are_adjacent = are_advanced_indices_adjacent(indices); // Step 1 - const auto batched_indices = batchIndices(indices, indices_bdims, self_.size(0), self_bdim); + const auto batched_indices = batchIndices(indices, indices_bdims, self_.sym_size(0), self_bdim); auto num_leading_nones = get_num_leading_nones(indices); auto max_index_dim = get_max_index_logical_dim(indices, indices_bdims); @@ -841,26 +841,26 @@ std::tuple> gather_batch_rule( return std::make_tuple(result, 0); } -Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t dim) { +Tensor get_expanded_index(const Tensor& index, SymIntArrayRef self_size, int64_t dim) { if (index.dim() == 0) { - return index.expand(self_size); + return index.expand_symint(self_size); } dim = maybe_wrap_dim(dim, static_cast(self_size.size())); // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1] // to reshape index_ - auto idx_size = index.size(0); // get non-batch size of index tensor + auto idx_size = index.sym_size(0); // get non-batch size of index tensor Tensor index_; { - VmapDimVector new_index_shape(self_size.size(), 1); + VmapSymDimVector new_index_shape(self_size.size(), 1); new_index_shape[dim] = idx_size; - index_ = index.view(new_index_shape); + index_ = index.view_symint(new_index_shape); } // Now apply expand to index_ { - VmapDimVector new_index_shape = {self_size.begin(), self_size.end()}; + VmapSymDimVector new_index_shape = {self_size.begin(), self_size.end()}; new_index_shape[dim] = idx_size; - index_ = index_.expand(new_index_shape); + index_ = index_.expand_symint(new_index_shape); } return index_; } @@ -869,7 +869,7 @@ Tensor index_select_decomp(const Tensor &self, int64_t dim, const Tensor &index) { Tensor index_ = index; if (self.dim() > index.dim()) { - index_ = get_expanded_index(index, self.sizes(), dim); + index_ = get_expanded_index(index, self.sym_sizes(), dim); } auto result = at::gather(self, dim, index_); @@ -893,7 +893,7 @@ Tensor index_copy_decomp( { Tensor index_ = index; if (self.dim() > index.dim()) { - index_ = get_expanded_index(index, self.sizes(), dim); + index_ = get_expanded_index(index, self.sym_sizes(), dim); } return at::scatter(self, dim, index_, source); ; @@ -909,7 +909,7 @@ Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src, std::optional end, int64_t step) { auto idx = at::arange(start.value_or(0), end.value_or(self.size(dim)), step, 
self.options().dtype(kLong)); - idx = get_expanded_index(idx, self.sizes(), dim); + idx = get_expanded_index(idx, self.sym_sizes(), dim); return at::scatter(self, dim, idx, src); } diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 2c223fb087169..25fe73aaad246 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -2567,6 +2567,51 @@ def mask_mod(b, h, q, kv): ): torch.compile(flex_attention)(query, key, value, block_mask=block_mask) + @supported_platform + def test_free_symbol_dynamic(self): + def batch_flip_causal(b, h, q_idx, kv_idx): + return (q_idx >= kv_idx) & (b % 2 == 0) + + class SimpleAttention(torch.nn.Module): + def __init__(self, dim=512, n_head=8): + super().__init__() + self.qkv = torch.nn.Linear(dim, 3 * dim) + self.n_head = n_head + self.head_dim = dim // n_head + + def forward(self, x, block_mask=None): + B, T, C = x.size() + qkv = self.qkv(x).view(B, T, 3, self.n_head, self.head_dim) + qkv = qkv.permute(2, 0, 3, 1, 4) + q, k, v = qkv + y = flex_attention(q, k, v, block_mask=block_mask) + return y.transpose(1, 2).contiguous().view(B, T, C) + + model = SimpleAttention().cuda() + model.compile(mode="default", dynamic=True) + sequence_len = 256 + + # Test different batch shapes with dense masks + torch._dynamo.reset() + for batch_shape in [4, 16, 32]: + # Create dense mask + rand_mask = torch.randint(0, 2, (batch_shape, sequence_len)).cuda().bool() + block_mask = torch.compile(create_block_mask, dynamic=True)( + B=batch_shape, + BLOCK_SIZE=128, + mask_mod=lambda b, h, q_idx, kv_idx: ~rand_mask[b, q_idx], + H=None, + Q_LEN=sequence_len, + KV_LEN=sequence_len, + device="cuda", + ) + + # Run forward pass + x = torch.randn(batch_shape, sequence_len, 512).cuda() + y = model(x, block_mask=block_mask) + + self.assertEqual(torch._dynamo.utils.counters["aot_autograd"]["ok"], 2) + @supported_platform def test_fw_bw_graph_correctness(self): cnt = CompileCounterWithBackend("aot_eager") diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 8e06c43e37416..48ff237c9876d 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -376,8 +376,14 @@ def modification( subgraph = self.subgraphs[subgraph_number] def add_input(name): + # This also implicitly adds name as an input to the kernel return self.args.input(name) + def print_and_rename_indexing(index): + # This also implicitly adds the indexing symbols as an input to + # the kernel + return self.kexpr(self.rename_indexing(index)) + name = f"PlaceholderSubstitution_{subgraph_number}" class PlaceholderSubstitution(V.WrapperHandler): # type: ignore[name-defined] @@ -387,7 +393,7 @@ def load(self, name: str, index: sympy.Expr): if name not in fixed_inputs: # If it's not a fixed input, it's a load from a captured # tensor - index_str = outer_self.kexpr(index) + index_str = print_and_rename_indexing(index) var = add_input(name) return f"tl.load({var} + {index_str})" From 36c6ad71ba9fd00a85838ac328fd9b01e440d4b6 Mon Sep 17 00:00:00 2001 From: Will Feng Date: Thu, 24 Oct 2024 12:45:54 -0700 Subject: [PATCH 050/161] [tlparse] Add `dynamo_graph_break_reason` logging to trace_structured (#138778) A common challenge during torch.compile enablement is to answer user's question: "where is the graph break?". This PR will help make it easier to answer by surfacing graph breaks and their corresponding user stack trace / compiler stack trace in a direct link e.g. 
`0_0_0/dynamo_graph_break_reason_0.txt` from tlparse index.html. ![image](https://github.com/user-attachments/assets/79cd43f5-af14-4d08-9d5b-cb47d8203851) ![image](https://github.com/user-attachments/assets/23233ee2-0d56-4526-bf9a-d22c337c4d18) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138778 Approved by: https://github.com/ezyang --- test/dynamo/test_structured_trace.py | 6 ++++++ torch/_dynamo/convert_frame.py | 12 ++++++++++-- torch/_dynamo/exc.py | 8 ++++++++ torch/_dynamo/symbolic_convert.py | 26 ++++++++++++++++++++------ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/test/dynamo/test_structured_trace.py b/test/dynamo/test_structured_trace.py index e9631268b14e8..c0999b155ba0a 100644 --- a/test/dynamo/test_structured_trace.py +++ b/test/dynamo/test_structured_trace.py @@ -332,12 +332,14 @@ def test_example_training_fn(self): {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack1']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1} @@ -347,6 +349,7 @@ def test_example_training_fn(self): {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, 
"attempt": 0} {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} {"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} @@ -470,6 +473,7 @@ def forward(self, x): self.buffer.getvalue(), """\ {"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} @@ -506,6 +510,7 @@ def forward(self, x): self.buffer.getvalue(), """\ {"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} @@ -579,6 +584,7 @@ def fn(x): self.buffer.getvalue(), """\ {"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"} {"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0} diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 535095d015aa1..64d884ff33e5b 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -1200,9 +1200,17 @@ def __call__( user_stack_formatted = "".join( traceback.format_list(user_stack) ) + user_stack_trace = f"Graph break: skip: from user code at:\n{user_stack_formatted}" + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_graph_break_reason", + "encoding": "string", + }, + payload_fn=lambda: f"{user_stack_trace}\n{traceback.format_exc()}", + ) graph_break_log.debug( - "Graph break: skip: from user code at:\n%s", - user_stack_formatted, + user_stack_trace, exc_info=True, ) diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py index 7428f3c7c91f5..fad5cb2f35170 100644 --- 
a/torch/_dynamo/exc.py +++ b/torch/_dynamo/exc.py @@ -288,6 +288,14 @@ def unimplemented_with_warning(e: Exception, code, msg: str) -> NoReturn: # exception, its ok to fallback to eager but not silently. Here, we can use # this function to log the message and the stack trace. graph_break_msg = format_error_msg_verbose(e, code) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_graph_break_reason", + "encoding": "string", + }, + payload_fn=lambda: graph_break_msg, + ) graph_breaks_log.debug("%s", graph_break_msg) log.warning(msg) unimplemented(msg, from_exc=e) diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index b8cfdae4bb23d..40f05e94891cb 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -377,6 +377,25 @@ def log_graph_break(code_options, reason="", exc_info=False, user_stack=None): code_options["co_firstlineno"], ) + user_stack_formatted = "".join(traceback.format_list(user_stack)) + user_stack_trace = ( + "Graph break in user code at %s:%s\nReason: %s\nUser code traceback:\n%s" # noqa: UP031 + % ( + frame_loc[0], + frame_loc[1], + reason, + user_stack_formatted, + ) + ) + torch._logging.trace_structured( + "artifact", + metadata_fn=lambda: { + "name": "dynamo_graph_break_reason", + "encoding": "string", + }, + payload_fn=lambda: f"{user_stack_trace}\n{traceback.format_exc() if exc_info else ''}", + ) + # torch._dynamo.explain() formats this a little nicer, and presents a slightly # more actionable user code pointer if ( @@ -384,16 +403,11 @@ def log_graph_break(code_options, reason="", exc_info=False, user_stack=None): and not explain and graph_break_dup_warning_checker.add(frame_loc) ): - user_stack_formatted = "".join(traceback.format_list(user_stack)) # This log line MUST contain the string "Graph break in user code", # This log line is exercised from # python test/dynamo/test_exc.py -k test_graph_break_log graph_break_log.debug( - "Graph break in user code at %s:%s\nReason: %s\nUser code traceback:\n%s", - frame_loc[0], - frame_loc[1], - reason, - user_stack_formatted, + user_stack_trace, exc_info=exc_info, ) else: From 94e341c6a30a2fb7b13e379d37eddfd26b3134fd Mon Sep 17 00:00:00 2001 From: David Berard Date: Wed, 23 Oct 2024 21:18:09 -0700 Subject: [PATCH 051/161] [user triton] fix codegen for tl.constexpr globals (#138757) Fixes #138509 tl.constexpr globals would be codegen-ed as `constexpr()` instead of `tl.constexpr()` if they were un-annotated. This fixes the issue (and adds a test). The correct handling was already added but the corrected string was not being used in the un-annotated branch. 
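For reference, here is a minimal sketch of the failing pattern, adapted from the test added in this patch (the constant and kernel names are illustrative, and it assumes a CUDA device with Triton installed):

```python
import torch
import triton
import triton.language as tl

SCALE = tl.constexpr(3.14)  # module-level constexpr, intentionally un-annotated

@triton.jit
def add_scale_kernel(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    x = tl.load(in_ptr + offsets)
    # The un-annotated global is captured by the kernel and has to be
    # reproduced in Inductor's generated wrapper code.
    tl.store(out_ptr + offsets, x + SCALE)

def fn(x):
    y = torch.empty_like(x)
    BLOCK_SIZE = 256
    grid = (triton.cdiv(x.numel(), BLOCK_SIZE),)
    add_scale_kernel[grid](x, y, BLOCK_SIZE)
    return y

x = torch.randn(512, device="cuda")
# Before this fix the wrapper spelled the global as an unqualified `constexpr(...)`,
# which is not defined in the generated module; with the fix it emits
# `tl.constexpr(...)` and the compiled result matches eager.
torch.testing.assert_close(torch.compile(fn)(x), x + 3.14)
```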
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138757 Approved by: https://github.com/oulgen --- test/inductor/test_triton_kernels.py | 28 ++++++++++++++++++++++++++++ torch/_inductor/codegen/wrapper.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py index f1475dae694fe..759f46e3c8ac3 100644 --- a/test/inductor/test_triton_kernels.py +++ b/test/inductor/test_triton_kernels.py @@ -66,6 +66,7 @@ def _triton_get_ast_equal_to_str(params): CONSTANT_C: tl.constexpr = 4 STRING_CONSTANT_C: tl.constexpr = "CONSTANT_C" BOOL_CONSTANT_C: tl.constexpr = True + FLOAT_CONSTANT_C = tl.constexpr(3.14) # intentionally un-annotated class KernelTests(torch._inductor.test_case.TestCase): @@ -2171,6 +2172,33 @@ def sin_triton(x, out): sin_triton_compiled(None, out_compiled) self.assertEqual(out, out_compiled) + @requires_gpu + def test_triton_kernel_global_constexpr(self): + @triton.jit + def triton_(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(in_ptr + offsets) + output = x + FLOAT_CONSTANT_C + tl.store(out_ptr + offsets, output) + + def fn(x): + y = torch.empty_like(x) + BLOCK_SIZE = 256 + grid = (triton.cdiv(x.numel(), BLOCK_SIZE),) + triton_[grid](x, y, BLOCK_SIZE) + return y + + # make sure FLOAT_CONSTANT_C is NOT annotated + self.assertFalse("FLOAT_CONSTANT_C" in globals().get("__annotations__", {})) + # sanity check: STRING_CONSTANT_C _should_ be annotated + self.assertTrue("STRING_CONSTANT_C" in globals().get("__annotations__", {})) + + x = torch.randn(512, device=GPU_TYPE) + expected = x + 3.14 + actual = torch.compile(fn)(x) + self.assertEqual(expected, actual) + def make_mutation_test(fn): @requires_gpu diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index 367f707b10cf4..ec6d72b93bfe6 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -1476,7 +1476,7 @@ def traverse(cur_kernel): f"{symbol_name}{annotation_code} = {symbol_str}" ) else: - compile_wrapper.writeline(f"{symbol_name} = {symbol!r}") + compile_wrapper.writeline(f"{symbol_name} = {symbol_str}") symbols_included.add(symbol_name) elif ( symbol_name in unqualified_loads From 907f001a682c4e0893fa2c999ac4c9ff16581835 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 25 Oct 2024 03:53:25 +0000 Subject: [PATCH 052/161] Bump onnx from 1.16.1 to 1.17.0 in /.ci/docker (#138719) Bumps [onnx](https://github.com/onnx/onnx) from 1.16.1 to 1.17.0.
Release notes (sourced from onnx's releases):

v1.17.0

ONNX v1.17.0 is now available with exciting new features! We would like to thank everyone who contributed to this release! Please visit onnx.ai to learn more about ONNX and associated projects.

Key Updates

ai.onnx Opset 22

Python Changes

  • Support for numpy >= 2.0

Bug fixes and infrastructure improvements

  • Fix Check URLs errors 5972
  • Use CMAKE_PREFIX_PATH in finding libprotobuf 5975
  • Bump main VERSION_NUMBER to 1.17.0 5968
  • Fix source and pip tar.gz builds on s390x systems 5984
  • Fix unique_name 5992
  • Fix SegFault bug in shape inference 5990
  • Fix onnx.compose when connecting subgraphs 5991
  • Fix conversion from split 11 to split 18 6020
  • Update error messages for NegativeLogLikelihoodLoss inference function 6021
  • Generalize input/output number check in shape inference 6005
  • Replace rank inference with shape inference for Einsum op 6010
  • build from source instruction with latest cmake change 6038
  • Handle OneHot's depth value during shape inference 5963
  • Not to install cmake in pyproject.toml on Windows 6045
  • fix a skipped shape infer code 6049
  • Include the ".onnxtext" extension in supported serialization format 6051
  • Allow ReferenceEvaluator to return intermediate results 6066
  • Fix 1 typo in numpy_helper.py 6041
  • Remove benchmarking code 6076
  • Prevent crash on import after GCC 8 builds 6048
  • Check graph outputs are defined 6083
  • Enable additional ruff rules 6032
  • Add missing shape inference check for DequantizeLinear 6080
  • Add bfloat16 to all relevant ops 6099
  • fix(ci): install python dependencies with --only-binary :all: in manylinux 6120
  • fix: install google-re2 with --only-binary option 6129
  • Specify axis parameter for DequantizeLinear when input rank is 1 6095
  • Pin onnxruntime to 1.17.3 for release CIs 6143
  • Fix INT4 TensorProto byte size is 5x larger than expected with negative values 6161
  • Mitigate tarball directory traversal risks 6164
  • Fix reference implementation for ScatterND with 4D tensors 6174
  • Addition of group > 1 in test and in backend for ConvTranspose 6175
  • Support for bfloat16 for binary, unary operators in reference implementation 6166
  • Refactor windows workflow to work on standard windows 6190
  • Fix a few crashes while running shape inference 6195
  • Update onnx to work with numpy>=2.0 6196
  • Use sets to improve performance of dfs search 6213

... (truncated)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138719 Approved by: https://github.com/ezyang Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .ci/docker/requirements-ci.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4a92c47173d48..d25a290f654aa 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -331,7 +331,7 @@ sympy==1.13.1 ; python_version >= "3.9" #Pinned versions: #test that import: -onnx==1.16.1 +onnx==1.17.0 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: From b999daf7a957ecd9132176544acfa8b087b84fd4 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Fri, 25 Oct 2024 05:23:08 +0000 Subject: [PATCH 053/161] Add sets to list of safe objects to de-serialize (#138866) Lists, dicts and tuples are already allowed, it's a bit weird not to exclude set from the list of basic containers. Test plan (in addition to unittest): ```python torch.save({1, 2, 3}, "foo.pt") torch.load("foo.pt", weights_only=True) ``` Fixes https://github.com/pytorch/pytorch/issues/138851 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138866 Approved by: https://github.com/mikaylagawarecki Co-authored-by: mikaylagawarecki --- test/test_serialization.py | 8 ++++++++ torch/_weights_only_unpickler.py | 1 + 2 files changed, 9 insertions(+) diff --git a/test/test_serialization.py b/test/test_serialization.py index 92854dd9b6d1a..a58e47c083176 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -4526,6 +4526,14 @@ def test_safe_globals_context_manager_weights_only(self): finally: torch.serialization.clear_safe_globals() + def test_sets_are_loadable_with_weights_only(self): + s = {1, 2, 3} + with tempfile.NamedTemporaryFile() as f: + torch.save(s, f) + f.seek(0) + l_s = torch.load(f, weights_only=True) + self.assertEqual(l_s, s) + @unittest.skipIf(not torch.cuda.is_available(), "map_location loads to cuda") def test_tensor_subclass_map_location(self): t = TwoTensor(torch.randn(2, 3), torch.randn(2, 3)) diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py index fe437d79899da..918db8ba0be9b 100644 --- a/torch/_weights_only_unpickler.py +++ b/torch/_weights_only_unpickler.py @@ -169,6 +169,7 @@ def _get_allowed_globals(): "torch.device": torch.device, "_codecs.encode": encode, # for bytes "builtins.bytearray": bytearray, # for bytearray + "builtins.set": set, # for set } # dtype for t in torch.storage._dtype_to_storage_type_map().keys(): From ce631939f04e2ae6efee44247239702842612841 Mon Sep 17 00:00:00 2001 From: cyyever Date: Fri, 25 Oct 2024 05:32:38 +0000 Subject: [PATCH 054/161] [Distributed] [18/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138692) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138692 Approved by: https://github.com/ezyang --- torch/csrc/distributed/c10d/Backoff.cpp | 2 +- .../distributed/c10d/CudaDMAConnectivity.cpp | 8 ++--- torch/csrc/distributed/c10d/GroupRegistry.cpp | 12 +++---- torch/csrc/distributed/c10d/GroupRegistry.hpp | 2 +- torch/csrc/distributed/c10d/NCCLUtils.cpp | 10 +++--- torch/csrc/distributed/c10d/NCCLUtils.hpp | 35 ++++++++----------- .../distributed/c10d/ProcessGroupNCCL.cpp | 20 ++++++----- .../distributed/c10d/ProcessGroupNCCL.hpp | 16 ++++----- 
torch/csrc/distributed/c10d/init.cpp | 4 ++- .../csrc/distributed/c10d/intra_node_comm.hpp | 2 +- torch/csrc/distributed/c10d/logger.cpp | 4 +-- .../distributed/c10d/python_comm_hook.cpp | 1 + torch/csrc/distributed/c10d/reducer.cpp | 4 +-- torch/csrc/distributed/c10d/reducer.hpp | 3 +- torch/csrc/distributed/c10d/socket.cpp | 4 ++- torch/csrc/distributed/rpc/agent_utils.cpp | 2 +- 16 files changed, 66 insertions(+), 63 deletions(-) diff --git a/torch/csrc/distributed/c10d/Backoff.cpp b/torch/csrc/distributed/c10d/Backoff.cpp index 6aadc33cbc5ea..850cb45181b91 100644 --- a/torch/csrc/distributed/c10d/Backoff.cpp +++ b/torch/csrc/distributed/c10d/Backoff.cpp @@ -46,7 +46,7 @@ std::chrono::milliseconds ExponentialBackoffWithJitter::nextBackoff() { std::chrono::milliseconds maxSampleInterval = currentInterval_ + randomization; - std::uniform_int_distribution<> dist( + std::uniform_int_distribution dist( minSampleInterval.count(), maxSampleInterval.count()); std::chrono::milliseconds backoffInterval{dist(gen_)}; diff --git a/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp b/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp index 1aaab10cf9683..1ed72a9aa116a 100644 --- a/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp @@ -12,6 +12,7 @@ namespace { constexpr int max_nvlinks = 64; std::string get_bus_id(int device_idx) { + // NOLINTNEXTLINE(*array*) char bus_id[80]; cudaDeviceProp prop{}; C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device_idx)); @@ -27,7 +28,7 @@ std::string get_bus_id(int device_idx) { struct C10_EXPORT NVLinkDetector : public c10d::DMAConnectivityDetector { c10::intrusive_ptr detect() override { - int num_devices; + int num_devices = 0; C10_CUDA_CHECK(cudaGetDeviceCount(&num_devices)); std::vector> matrix; @@ -74,9 +75,8 @@ struct C10_EXPORT NVLinkDetector : public c10d::DMAConnectivityDetector { std::vector switch_link_count(num_devices, 0); for (int i = 0; i < num_devices; ++i) { for (int link = 0; link < max_nvlinks; ++link) { - nvmlReturn_t ret; - nvmlIntNvLinkDeviceType_t deviceType; - ret = driver_api->nvmlDeviceGetNvLinkRemoteDeviceType_( + nvmlIntNvLinkDeviceType_t deviceType{}; + auto ret = driver_api->nvmlDeviceGetNvLinkRemoteDeviceType_( nvml_devices[i], link, &deviceType); if (ret != NVML_SUCCESS) { // We've exhausted the NVLinks connected to this device. 
This error diff --git a/torch/csrc/distributed/c10d/GroupRegistry.cpp b/torch/csrc/distributed/c10d/GroupRegistry.cpp index b13b4fa07c28e..2a735a4c99592 100644 --- a/torch/csrc/distributed/c10d/GroupRegistry.cpp +++ b/torch/csrc/distributed/c10d/GroupRegistry.cpp @@ -10,10 +10,11 @@ namespace { class GroupRegistry { public: void register_group( - const std::string& group_name, + std::string group_name, c10::intrusive_ptr group) { std::unique_lock write_lock(lock_); - auto [_, inserted] = registry_.try_emplace(group_name, std::move(group)); + auto [_, inserted] = + registry_.try_emplace(std::move(group_name), std::move(group)); TORCH_CHECK( inserted, "A process group is already registered under the name", @@ -70,12 +71,11 @@ bool get_thread_isolation_mode() { void register_process_group( const std::string& group_name, - c10::intrusive_ptr group) { + const c10::intrusive_ptr& group) { if (thread_isolation_mode) { - RankLocal<::GroupRegistry>::get().register_group( - group_name, std::move(group)); + RankLocal<::GroupRegistry>::get().register_group(group_name, group); } else { - process_registry.register_group(group_name, std::move(group)); + process_registry.register_group(group_name, group); } } diff --git a/torch/csrc/distributed/c10d/GroupRegistry.hpp b/torch/csrc/distributed/c10d/GroupRegistry.hpp index b22fb1ae8faf3..dc64adeaf6618 100644 --- a/torch/csrc/distributed/c10d/GroupRegistry.hpp +++ b/torch/csrc/distributed/c10d/GroupRegistry.hpp @@ -10,7 +10,7 @@ bool get_thread_isolation_mode(); C10_EXPORT void register_process_group( const std::string& group_name, - c10::intrusive_ptr group); + const c10::intrusive_ptr& group); C10_EXPORT c10::intrusive_ptr resolve_process_group( const std::string& group_name); diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 3b881c2985cb3..6bbb2318ba8ab 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -176,9 +176,9 @@ bool nccl_use_nonblocking() { int nccl_nonblocking_timeout() { static int timeout = -2; // -2 means not initialized if (timeout == -2) { - const char* val = getenv("TORCH_NCCL_NONBLOCKING_TIMEOUT"); - if (val && strlen(val) > 0) { - timeout = strtol(val, nullptr, 0); + const auto val = c10::utils::get_env("TORCH_NCCL_NONBLOCKING_TIMEOUT"); + if (val.has_value() && !val.value().empty()) { + timeout = stoi(val.value()); } else { // Default value consistent with kBackendDefaultTimeout timeout = 30 * 60; @@ -353,7 +353,7 @@ void DebugInfoWriter::write(const std::string& ncclTrace) { return; } - file.write(ncclTrace.data(), ncclTrace.size()); + file.write(ncclTrace.data(), static_cast(ncclTrace.size())); if (!file) { LOG(ERROR) << "Error opening file for writing NCCLPG debug info: " << filename_; @@ -547,7 +547,7 @@ void NCCLTraceBuffer::retire_id( return; } if (duration.has_value()) { - entry->duration_ = duration.value(); + entry->duration_ = duration; } } } diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 5f01109b139d9..a5099ab583f97 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -3,12 +3,11 @@ #ifdef USE_C10D_NCCL #include -#include -#include +#include +#include #include #include -#include #include #include @@ -265,7 +264,7 @@ class TORCH_API DebugInfoWriter { } protected: - DebugInfoWriter(std::string namePrefix, int rank) { + DebugInfoWriter(const std::string& namePrefix, int rank) { filename_ = c10::str(namePrefix, 
rank); } std::string filename_; @@ -278,14 +277,9 @@ class TORCH_API DebugInfoWriter { // RAII wrapper for NCCL communicator class NCCLComm { public: - explicit NCCLComm(ncclComm_t ncclComm) - : aborted_(false), - ncclAsyncErr_(ncclSuccess), - commFailureReason_(std::nullopt), - initialized_(false), - ncclComm_(ncclComm) {} + explicit NCCLComm(ncclComm_t ncclComm) : ncclComm_(ncclComm) {} - NCCLComm() : NCCLComm(nullptr) {} + NCCLComm() = default; ~NCCLComm() noexcept { // Add lock in this destructor, as aborted_ needs to be read after memory @@ -379,6 +373,7 @@ class NCCLComm { NCCLComm& operator=(NCCLComm&& other) = delete; // Move constructable + // NOLINTNEXTLINE(.*-noexcept-move-.*) NCCLComm(NCCLComm&& other) { // Using other's lock, as it reads other's states // Can not use this.mutex_, as this object is being constructed. @@ -488,7 +483,7 @@ class NCCLComm { " has already been registered on ncclComm_ ", ncclComm_); - void* handle; + void* handle = nullptr; // Use getNcclComm to make sure comm is ready before calling nccl APIs auto comm = getNcclComm(); C10D_NCCL_CHECK( @@ -544,16 +539,16 @@ class NCCLComm { protected: // Unique nccl_id for this communicator. - ncclUniqueId ncclId_; - bool aborted_; + ncclUniqueId ncclId_{}; + bool aborted_{false}; uint64_t ncclCommSplitCounter_{0}; - ncclResult_t ncclAsyncErr_; + ncclResult_t ncclAsyncErr_{ncclSuccess}; mutable std::mutex mutex_; // Rank that this communicator corresponds to. - int rank_; + int rank_{}; // Optional reason for communicator failure, provided by ProcessGroupNCCL for // better error messaging. - std::optional commFailureReason_; + std::optional commFailureReason_{}; bool initialized_{false}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL @@ -572,7 +567,7 @@ struct ncclRedOpRAII { : op_(op), comm_(comm), premul_sum_(true) {} ncclRedOpRAII(const ncclRedOpRAII&) = delete; ncclRedOpRAII& operator=(const ncclRedOpRAII&) = delete; - ncclRedOpRAII(ncclRedOpRAII&& tmp) : ncclRedOpRAII() { + ncclRedOpRAII(ncclRedOpRAII&& tmp) noexcept : ncclRedOpRAII() { std::swap(tmp.op_, this->op_); std::swap(tmp.comm_, this->comm_); std::swap(tmp.premul_sum_, this->premul_sum_); @@ -587,8 +582,8 @@ struct ncclRedOpRAII { operator ncclRedOp_t() const { return op_; } - ncclRedOp_t op_; - ncclComm_t comm_; + ncclRedOp_t op_{}; + ncclComm_t comm_{}; bool premul_sum_ = false; }; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index f8f83508cb6b6..8b29261d733e3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -305,7 +305,7 @@ static bool allocatorHooksAttached = false; std::atomic ProcessGroupNCCL::shouldDump_(false); -void cacheAllocatorRegisterHook( +static void cacheAllocatorRegisterHook( const c10::cuda::CUDACachingAllocator::TraceEntry& te) { // Register after SEGMENT_ALLOC if (te.action_ != @@ -323,7 +323,7 @@ void cacheAllocatorRegisterHook( } } -void cacheAllocatorDeregisterHook( +static void cacheAllocatorDeregisterHook( const c10::cuda::CUDACachingAllocator::TraceEntry& te) { // deregister before SEGMENT_FREE if (te.action_ != @@ -341,8 +341,9 @@ void cacheAllocatorDeregisterHook( } } -std::unordered_map> -getNCCLCommDumpMap() { +static std:: + unordered_map> + getNCCLCommDumpMap() { #if defined(IS_NCCLX) && defined(NCCL_COMM_DUMP) std::unordered_map< std::string /* ncclUniqueID */, @@ -464,7 +465,7 @@ gil_checker_t& get_gil_checker() { return gil_checker; } -std::future 
launchAsyncGilCheck() { +static std::future launchAsyncGilCheck() { std::promise resultPromise; std::future resultFuture = resultPromise.get_future(); TORCH_CHECK(get_gil_checker(), "Can't check GIL with null GIL checker"); @@ -861,12 +862,12 @@ constexpr const char* MULTI_DEVICE_ERROR_MSG = "ProcessGroupNCCL continues supporting multi-process and multi-thread modes."; ProcessGroupNCCL::ProcessGroupNCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options) : Backend(rank, size), - store_(store), + store_(std::move(store)), options_(std::move(options)), traceKeyStart_(getTraceStartKey("NCCL", rank)), @@ -1286,7 +1287,8 @@ void ProcessGroupNCCL::abortCommsFromMap( // Note: original name of this method is `abort`. It was renamed to // `abortComms` to distinguish from the `abort` method below. The `abort` // method calls `abortComms` but does more destruction than the latter. -bool ProcessGroupNCCL::abortComms(std::optional abortReason) { +bool ProcessGroupNCCL::abortComms( + const std::optional& abortReason) { // Remove record from global ncclCommDevIdxMapMutex before aboarting, // so that a new cache segment would not register to already aborded // communicators. Note that ncclCommDevIdxMap is a global container which may @@ -1407,7 +1409,7 @@ void ProcessGroupNCCL::terminateProcess(const std::string& errMsg) { LOG(FATAL) << logPrefix() << errMsg; } -long computeDeltaMS( +static long computeDeltaMS( std::chrono::time_point start, std::chrono::time_point end) { return std::chrono::duration_cast(end - start) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 66c25d53c35ad..5ec9ae32405f6 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -525,7 +525,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { // communicator. These NCCL communicators are cached and reused if possible. // ProcessGroupNCCL( - const c10::intrusive_ptr& store, + c10::intrusive_ptr store, int rank, int size, c10::intrusive_ptr options = Options::create()); @@ -776,7 +776,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { bool dumpDebuggingInfo(); // Abort all communicators on this rank. - bool abortComms(std::optional abortReason = std::nullopt); + bool abortComms(const std::optional& abortReason = std::nullopt); private: int globalRankStart; @@ -1029,7 +1029,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::mutex mutex_; // Heartbeat of watchdog thread. - std::atomic_uint64_t heartbeat_; + std::atomic_uint64_t heartbeat_{}; // The time interval used for deciding whether there is no watchdog heartbeat. int heartbeatTimeoutInSec_; @@ -1048,10 +1048,10 @@ class TORCH_API ProcessGroupNCCL : public Backend { int ncclTraceBufferSize_; // We gate the heartbeat monitor thread so that we can roll it out gradually. - std::atomic monitorThreadEnabled_; + std::atomic monitorThreadEnabled_{}; // We gate the cudaEventCache so that we can roll it out gradually. - std::atomic cudaEventCacheEnabled_; + std::atomic cudaEventCacheEnabled_{}; // Monitor thread which checks the heartbeat of Watchdog thread. 
// If the monitor thread finds there is no heartbeat, it will dump debug info @@ -1074,7 +1074,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::atomic collectiveDebugInfoMode_; // Whether there are hooks pending to be fired - std::atomic hasPendingHooks_; + std::atomic hasPendingHooks_{}; // This is the signal from watchdog threads to indicate whether the monitor // thread should dump. Making it static so that it is accessiable from all the @@ -1188,11 +1188,11 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Whether or not to create start CUDAEvent and enable timing for start // and end events. Note that enableTiming_ is always true if desyncDebug_ // is set to true. - std::atomic enableTiming_; + std::atomic enableTiming_{}; // Flag to enable the print of hash value of input/output of collectives for // verification. - std::atomic enableCollecticeHashDebug_; + std::atomic enableCollecticeHashDebug_{}; // Whether or not TORCH_NCCL_AVOID_RECORD_STREAMS was set bool avoidRecordStreams_ = false; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 0b28f7c183837..b1cebfe0502be 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #ifndef _WIN32 #include @@ -106,6 +107,7 @@ class IntrusivePtrNoGilDestructor { // This ctor is very important; see // https://github.com/pybind/pybind11/issues/2957 explicit IntrusivePtrNoGilDestructor(T* impl) + // NOLINTNEXTLINE(bugprone-exception-escape) : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} ~IntrusivePtrNoGilDestructor() { if (impl_) { @@ -908,7 +910,7 @@ This class does not support ``__members__`` property.)"); module.def( "_register_process_group", [](const std::string& group_name, - c10::intrusive_ptr<::c10d::ProcessGroup> group) { + const c10::intrusive_ptr<::c10d::ProcessGroup>& group) { ::c10d::register_process_group(group_name, group); }, py::arg("group_name"), diff --git a/torch/csrc/distributed/c10d/intra_node_comm.hpp b/torch/csrc/distributed/c10d/intra_node_comm.hpp index da995d80ca0f2..4c31149de44c1 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.hpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.hpp @@ -72,7 +72,7 @@ class TORCH_API IntraNodeComm : public c10::intrusive_ptr_target { * Members initialized after rendezvous */ bool isInitialized_ = false; - int deviceIdx_; + int deviceIdx_{0}; Topology topology_ = Topology::UNKNOWN; void* symmetricMemoryPtr_ = nullptr; c10::intrusive_ptr symmetricMemory_ = nullptr; diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 48f8786842f01..a43e428e899e0 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -61,7 +61,7 @@ Logger::Logger(std::shared_ptr reducer) ddp_logging_data_ = std::make_unique(); } -c10::once_flag log_graph_static_flag; +static c10::once_flag log_graph_static_flag; void Logger::log_if_graph_static(bool is_static) { c10::call_once(log_graph_static_flag, [this, is_static]() { @@ -116,7 +116,7 @@ void Logger::set_env_variables() { void Logger::set_parameter_stats() { // The number of parameter tensors ddp_logging_data_->ints_map["num_parameter_tensors"] = - reducer_->params_.size(); + static_cast(reducer_->params_.size()); // Total parameters size (Bytes) ddp_logging_data_->ints_map["total_parameter_size_bytes"] = 0; // Parameters' data types, there may be multiple data diff --git 
a/torch/csrc/distributed/c10d/python_comm_hook.cpp b/torch/csrc/distributed/c10d/python_comm_hook.cpp index c5b24e01fb515..adf73452bd7b4 100644 --- a/torch/csrc/distributed/c10d/python_comm_hook.cpp +++ b/torch/csrc/distributed/c10d/python_comm_hook.cpp @@ -7,6 +7,7 @@ namespace c10d { +// NOLINTNEXTLINE(bugprone-exception-escape) PythonCommHook::~PythonCommHook() { py::gil_scoped_acquire ag; state_.dec_ref(); diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 21b0c4acff19f..bf21bab37ce3f 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -1044,11 +1044,11 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { } void Reducer::install_futures( - c10::List> futs) { + const c10::List>& futs) { // Append instead of overwrite so that this method can be called multiple // times in one iteration. if (!installed_futures_) { - installed_futures_ = std::move(futs); + installed_futures_ = futs; } else { installed_futures_->append(futs); } diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp index aa3c40ae95bbf..e0f6b4570fa31 100644 --- a/torch/csrc/distributed/c10d/reducer.hpp +++ b/torch/csrc/distributed/c10d/reducer.hpp @@ -137,7 +137,8 @@ class TORCH_API Reducer { // Install futures that should be awaited at end of backwards. Currently these // are only used by user-defined custom buffer reduction hooks, but can be // generalized to any user-originating futures that need to be awaited. - void install_futures(c10::List> futs); + void install_futures( + const c10::List>& futs); // Returns true if we should rebuild buckets, else false. We only rebuild // buckets once after the first iteration and never rebuild them if diff --git a/torch/csrc/distributed/c10d/socket.cpp b/torch/csrc/distributed/c10d/socket.cpp index db76a1eb284e5..cad9630345cf5 100644 --- a/torch/csrc/distributed/c10d/socket.cpp +++ b/torch/csrc/distributed/c10d/socket.cpp @@ -206,6 +206,7 @@ std::string formatSockAddr(const struct ::sockaddr* addr, socklen_t len) { // if we can't resolve the hostname, display the IP address if (addr->sa_family == AF_INET) { struct sockaddr_in* psai = (struct sockaddr_in*)&addr; + // NOLINTNEXTLINE(*array*) char ip[INET_ADDRSTRLEN]; if (inet_ntop(addr->sa_family, &(psai->sin_addr), ip, INET_ADDRSTRLEN) != nullptr) { @@ -213,6 +214,7 @@ std::string formatSockAddr(const struct ::sockaddr* addr, socklen_t len) { } } else if (addr->sa_family == AF_INET6) { struct sockaddr_in6* psai = (struct sockaddr_in6*)&addr; + // NOLINTNEXTLINE(*array*) char ip[INET6_ADDRSTRLEN]; if (inet_ntop( addr->sa_family, &(psai->sin6_addr), ip, INET6_ADDRSTRLEN) != @@ -275,7 +277,7 @@ struct formatter { addr.ai_addr = addr_ptr; addr.ai_addrlen = addr_len; - auto remote = socket.remote(); + auto const& remote = socket.remote(); std::string remoteStr = remote ? 
*remote : "none"; return fmt::format_to( diff --git a/torch/csrc/distributed/rpc/agent_utils.cpp b/torch/csrc/distributed/rpc/agent_utils.cpp index 05b8dec259c40..ab4ef317d6b6a 100644 --- a/torch/csrc/distributed/rpc/agent_utils.cpp +++ b/torch/csrc/distributed/rpc/agent_utils.cpp @@ -16,7 +16,7 @@ std::unordered_map collectNames( std::unordered_map nameToId; nameToId.reserve(worldSize); nameToId.emplace(selfName, selfId); - // NOLINTNEXTLINE(bugprone-too-small-loop-variable) + // NOLINTNEXTLINE(*loop*) for (worker_id_t workerId = 0; workerId < worldSize; ++workerId) { if (workerId == selfId) { continue; From 86d4b7d60b264cae5a04a1b20719bcd7a5752a4c Mon Sep 17 00:00:00 2001 From: Xuehai Pan Date: Thu, 24 Oct 2024 23:52:49 +0800 Subject: [PATCH 055/161] [FX][export][dynamo] use `tuple` instead of `list` in normalized `args_spec` (#138212) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138212 Approved by: https://github.com/jansel --- torch/_inductor/pattern_matcher.py | 29 ++++++++++++++--------- torch/onnx/_internal/io_adapter.py | 38 +++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/torch/_inductor/pattern_matcher.py b/torch/_inductor/pattern_matcher.py index 36e8765759be9..061ddcb7c6c83 100644 --- a/torch/_inductor/pattern_matcher.py +++ b/torch/_inductor/pattern_matcher.py @@ -582,18 +582,25 @@ def simple_flatten( def pytree_flatten( args: Sequence[Any], kwargs: Mapping[Any, Any] ) -> Tuple[Sequence[Any], Union[_SimpleSpec, pytree.TreeSpec]]: - def norm_spec(s: pytree.TreeSpec) -> pytree.TreeSpec: - if s.type is None: - return s - mapping = {immutable_list: list, tuple: list, immutable_dict: dict} - return pytree.TreeSpec( - mapping.get(s.type, s.type), - s.context, - list(map(norm_spec, s.children_specs)), - ) + type_mapping = {immutable_list: tuple, list: tuple, immutable_dict: dict} + + def convert_type(x: Any) -> Any: + cls = type(x) + convert_fn = type_mapping.get(cls) + if convert_fn is not None: + return pytree.tree_map( + convert_type, + convert_fn(x), + is_leaf=lambda x: type(x) in type_mapping, + ) + return x - flat, spec = pytree.tree_flatten([args, kwargs]) - spec = norm_spec(spec) + normalized_args_tree = pytree.tree_map( + convert_type, + (args, kwargs), + is_leaf=lambda x: type(x) in type_mapping, + ) + flat, spec = pytree.tree_flatten(normalized_args_tree) return flat, spec def __repr__(self) -> str: diff --git a/torch/onnx/_internal/io_adapter.py b/torch/onnx/_internal/io_adapter.py index 16c1313a2d5a2..7334c79620de4 100644 --- a/torch/onnx/_internal/io_adapter.py +++ b/torch/onnx/_internal/io_adapter.py @@ -136,15 +136,35 @@ def apply( # TODO: make_fx lose stack info https://github.com/pytorch/pytorch/issues/90276 -def _replace_tuple_with_list(spec: pytree.TreeSpec) -> pytree.TreeSpec: - _type = list if spec.type == tuple else spec.type - return pytree.TreeSpec( - _type, spec.context, list(map(_replace_tuple_with_list, spec.children_specs)) +# TODO(XuehaiPan): Dynamo does not support `dummy_leaf = object()` as a sentinel value in the frame. +class _DummyLeaf: # use a class instead. 
+ pass + + +def _replace_list_with_tuple(spec: pytree.TreeSpec) -> pytree.TreeSpec: + def replace_list_with_tuple(x: Any) -> Any: + if type(x) is list: + return pytree.tree_map( + replace_list_with_tuple, + tuple(x), + is_leaf=lambda x: type(x) is list, + ) + return x + + dummy_leaf = _DummyLeaf() + dummy_tree = pytree.tree_unflatten([dummy_leaf] * spec.num_leaves, spec) + dummy_tree = pytree.tree_map( + replace_list_with_tuple, + dummy_tree, + is_leaf=lambda x: type(x) is list, ) + return pytree.tree_structure(dummy_tree) -def _open_top_level_list_if_single_element(spec: pytree.TreeSpec) -> pytree.TreeSpec: - if spec.type == list and spec.num_children == 1: +def _open_top_level_sequence_if_single_element( + spec: pytree.TreeSpec, +) -> pytree.TreeSpec: + if spec.type in (tuple, list) and spec.num_children == 1: return spec.children_specs[0] return spec @@ -167,10 +187,10 @@ def _assert_identical_pytree_spec( pass_if_any_checks: Sequence[Callable[[], bool]] = [ lambda: spec1 == spec2, # FIXME: Bug in `dynamo.export`. Sometimes outputs returned in 'list' instead of 'tuple'. - lambda: _replace_tuple_with_list(spec1) == _replace_tuple_with_list(spec2), + lambda: _replace_list_with_tuple(spec1) == _replace_list_with_tuple(spec2), # FIXME: Bug in `dynamo.export`. Sometimes single function return is wrapped in list. - lambda: _open_top_level_list_if_single_element(spec1) == spec2, - lambda: spec1 == _open_top_level_list_if_single_element(spec2), + lambda: _open_top_level_sequence_if_single_element(spec1) == spec2, + lambda: spec1 == _open_top_level_sequence_if_single_element(spec2), ] if not any(check() for check in pass_if_any_checks): From 22d2e2d9a031f7ae5823be0d00896ef6541d3cf7 Mon Sep 17 00:00:00 2001 From: Mwiza Kunda Date: Fri, 25 Oct 2024 09:38:08 +0000 Subject: [PATCH 056/161] Set RUNPATH so installed tests can find the required shared libraries (#136627) This change fixes the RUNPATH of installed c++ tests so that the linker can find the shared libraries they depend on. 
For example, currently: ```bash venv/lib/python3.10/site-packages/torch $ ./bin/test_lazy ./bin/test_lazy: error while loading shared libraries: libtorch.so: cannot open shared object file: No such file or directory ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/136627 Approved by: https://github.com/malfet --- c10/benchmark/CMakeLists.txt | 1 + c10/test/CMakeLists.txt | 1 + caffe2/CMakeLists.txt | 6 +++ test/cpp/api/CMakeLists.txt | 1 + test/cpp/c10d/CMakeLists.txt | 53 +++++++++++++-------------- test/cpp/dist_autograd/CMakeLists.txt | 1 + test/cpp/jit/CMakeLists.txt | 1 + test/cpp/lazy/CMakeLists.txt | 1 + test/cpp/rpc/CMakeLists.txt | 1 + test/cpp/tensorexpr/CMakeLists.txt | 2 + test/edge/CMakeLists.txt | 1 + 11 files changed, 41 insertions(+), 28 deletions(-) diff --git a/c10/benchmark/CMakeLists.txt b/c10/benchmark/CMakeLists.txt index 16b268e3800a0..8dee635d7e1d7 100644 --- a/c10/benchmark/CMakeLists.txt +++ b/c10/benchmark/CMakeLists.txt @@ -8,6 +8,7 @@ if(BUILD_TEST) add_executable(${bench_name} "${bench_src}") target_link_libraries(${bench_name} ${C10_LIB} benchmark) if(INSTALL_TEST) + set_target_properties(${bench_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${bench_name} DESTINATION test) endif() endforeach() diff --git a/c10/test/CMakeLists.txt b/c10/test/CMakeLists.txt index 7f2a61246c6c6..83b5b17f9c8a6 100644 --- a/c10/test/CMakeLists.txt +++ b/c10/test/CMakeLists.txt @@ -12,6 +12,7 @@ if(BUILD_TEST) target_link_libraries(${test_name} ${C10_LIB} gmock gtest gtest_main) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) endif() endforeach() diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 32ad2037febdc..d77a726b41e5e 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1748,6 +1748,7 @@ if(BUILD_TEST) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) @@ -1768,6 +1769,7 @@ if(BUILD_TEST) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) @@ -1789,6 +1791,7 @@ if(BUILD_TEST) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) @@ -1810,6 +1813,7 @@ if(BUILD_TEST) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) endif() endforeach() @@ -1824,6 
+1828,7 @@ if(BUILD_TEST) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) @@ -1843,6 +1848,7 @@ if(BUILD_TEST) target_compile_options(${test_name} PRIVATE ${HIP_CXX_FLAGS}) add_test(NAME ${test_name} COMMAND $) if(INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS ${test_name} DESTINATION test) endif() endforeach() diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index 61cf3a9be5ecd..fe34bf6a5021f 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -73,6 +73,7 @@ if(NOT MSVC) endif() if(INSTALL_TEST) + set_target_properties(test_api PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_api DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/cpp/c10d/CMakeLists.txt b/test/cpp/c10d/CMakeLists.txt index 0874852517e33..fdcc20c5bc753 100644 --- a/test/cpp/c10d/CMakeLists.txt +++ b/test/cpp/c10d/CMakeLists.txt @@ -6,37 +6,40 @@ if(USE_CUDA) endif() function(c10d_add_test test_src) + set(prefix ARG) + set(noValues) + set(singleValues INSTALL_TEST) + set(multiValues LINK_LIBRARIES) + + include(CMakeParseArguments) + cmake_parse_arguments(${prefix} "${noValues}" "${singleValues}" "${multiValues}" ${ARGN}) + get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE $) - target_link_libraries(${test_name} ${ARGN}) + target_link_libraries(${test_name} ${ARG_LINK_LIBRARIES}) if(NOT WIN32) target_link_libraries(${test_name} pthread) endif() add_test(NAME ${test_name} COMMAND $) + + if(ARG_INSTALL_TEST) + set_target_properties(${test_name} PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") + install(TARGETS ${test_name} DESTINATION bin) + endif() endfunction() -c10d_add_test(BackoffTest.cpp torch_cpu gtest_main) -c10d_add_test(FileStoreTest.cpp torch_cpu gtest_main) -c10d_add_test(TCPStoreTest.cpp torch_cpu gtest_main) -if(INSTALL_TEST) - install(TARGETS FileStoreTest DESTINATION bin) - install(TARGETS TCPStoreTest DESTINATION bin) -endif() +c10d_add_test(BackoffTest.cpp LINK_LIBRARIES torch_cpu gtest_main INSTALL_TEST OFF) +c10d_add_test(FileStoreTest.cpp LINK_LIBRARIES torch_cpu gtest_main INSTALL_TEST ${INSTALL_TEST}) +c10d_add_test(TCPStoreTest.cpp LINK_LIBRARIES torch_cpu gtest_main INSTALL_TEST ${INSTALL_TEST}) if(NOT WIN32) - c10d_add_test(HashStoreTest.cpp torch_cpu gtest_main) - if(INSTALL_TEST) - install(TARGETS HashStoreTest DESTINATION bin) - endif() + c10d_add_test(HashStoreTest.cpp LINK_LIBRARIES torch_cpu gtest_main INSTALL_TEST ${INSTALL_TEST}) endif() if(USE_CUDA) if(USE_GLOO AND USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu c10d_cuda_test gtest_main) - if(INSTALL_TEST) - install(TARGETS ProcessGroupGlooTest DESTINATION bin) - endif() - c10d_add_test(ProcessGroupGlooAsyncTest.cpp torch_cpu c10d_cuda_test gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp LINK_LIBRARIES torch_cpu c10d_cuda_test gtest_main INSTALL_TEST ${INSTALL_TEST}) + c10d_add_test(ProcessGroupGlooAsyncTest.cpp LINK_LIBRARIES 
torch_cpu c10d_cuda_test gtest_main INSTALL_TEST ${INSTALL_TEST}) endif() if(USE_NCCL AND USE_C10D_NCCL) # NCCL is a private dependency of libtorch, but the tests include some @@ -45,13 +48,11 @@ if(USE_CUDA) # a private dependency of the tests as well. c10d_add_test( ProcessGroupNCCLTest.cpp - torch_cpu c10d_cuda_test gtest_main __caffe2_nccl) + LINK_LIBRARIES torch_cpu c10d_cuda_test gtest_main __caffe2_nccl INSTALL_TEST ${INSTALL_TEST}) c10d_add_test( ProcessGroupNCCLErrorsTest.cpp - torch_cpu c10d_cuda_test gtest_main __caffe2_nccl) + LINK_LIBRARIES torch_cpu c10d_cuda_test gtest_main __caffe2_nccl INSTALL_TEST ${INSTALL_TEST}) if(INSTALL_TEST) - install(TARGETS ProcessGroupNCCLTest DESTINATION bin) - install(TARGETS ProcessGroupNCCLErrorsTest DESTINATION bin) install(TARGETS c10d_cuda_test DESTINATION lib) endif() endif() @@ -62,15 +63,14 @@ if(USE_CUDA) # a private dependency of the tests as well. c10d_add_test( ProcessGroupUCCTest.cpp - torch_cpu c10d_cuda_test gtest_main __caffe2_ucc) + LINK_LIBRARIES torch_cpu c10d_cuda_test gtest_main __caffe2_ucc INSTALL_TEST ${INSTALL_TEST}) if(INSTALL_TEST) - install(TARGETS ProcessGroupUCCTest DESTINATION bin) install(TARGETS c10d_cuda_test DESTINATION lib) endif() endif() else() if(USE_GLOO AND USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp LINK_LIBRARIES torch_cpu gtest_main INSTALL_TEST OFF) endif() endif() @@ -80,10 +80,7 @@ if(USE_MPI AND USE_C10D_MPI) # private headers of libtorch, which in turn include MPI. As a hacky # alternative to making MPI a public dependency of libtorch, we make it # a private dependency of the tests as well. - c10d_add_test(ProcessGroupMPITest.cpp torch_cpu MPI::MPI_CXX) - if(INSTALL_TEST) - install(TARGETS ProcessGroupMPITest DESTINATION bin) - endif() + c10d_add_test(ProcessGroupMPITest.cpp LINK_LIBRARIES torch_cpu MPI::MPI_CXX INSTALL_TEST ${INSTALL_TEST}) endif() if(LINUX AND USE_GLOO AND USE_C10D_GLOO) diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 0ae6e3bef1410..6b5bba4b82086 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -14,6 +14,7 @@ if(USE_DISTRIBUTED AND NOT WIN32) endif() if(INSTALL_TEST) + set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_dist_autograd DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 9f0299b166c0f..cd2eaf761dffd 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -151,6 +151,7 @@ elseif(USE_ROCM) endif() if(INSTALL_TEST) + set_target_properties(test_jit PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_jit DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/cpp/lazy/CMakeLists.txt b/test/cpp/lazy/CMakeLists.txt index 453c2b02083ba..9542343ff7816 100644 --- a/test/cpp/lazy/CMakeLists.txt +++ b/test/cpp/lazy/CMakeLists.txt @@ -44,6 +44,7 @@ elseif(USE_ROCM) endif() if(INSTALL_TEST) + set_target_properties(test_lazy PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_lazy DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/cpp/rpc/CMakeLists.txt b/test/cpp/rpc/CMakeLists.txt index 
6834b428ff937..5c3a0dc020de9 100644 --- a/test/cpp/rpc/CMakeLists.txt +++ b/test/cpp/rpc/CMakeLists.txt @@ -37,6 +37,7 @@ if(USE_CUDA) endif() if(INSTALL_TEST) + set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_cpp_rpc DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index 59a36fddbe7e2..9c409e078d9dd 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -71,7 +71,9 @@ elseif(USE_ROCM) endif() if(INSTALL_TEST) + set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_tensorexpr DESTINATION bin) + set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS tutorial_tensorexpr DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) diff --git a/test/edge/CMakeLists.txt b/test/edge/CMakeLists.txt index 50579c9109dc8..72c01a2d36492 100644 --- a/test/edge/CMakeLists.txt +++ b/test/edge/CMakeLists.txt @@ -73,5 +73,6 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU") ) endif() if(INSTALL_TEST) + set_target_properties(test_edge_op_registration PROPERTIES INSTALL_RPATH "${CMAKE_INSTALL_RPATH}:${_rpath_portable_origin}/../lib") install(TARGETS test_edge_op_registration DESTINATION bin) endif() From a1175e34375c542d7f8719f2eb6410af9c8ef0df Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 23 Oct 2024 21:57:32 -0400 Subject: [PATCH 057/161] [BE] Strides are always non-negative, remove pointless test (#138784) Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/138784 Approved by: https://github.com/Chillee --- torch/fx/experimental/symbolic_shapes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index 128a5fbcf039b..d5503ba25acb3 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -3777,7 +3777,7 @@ def _create_symbolic_sizes_strides_storage_offset( candidates = { ex_size[i] * ex_stride[i]: size[i] * stride[i] for i in range(len(size)) - if stride[i] is not None and ex_stride[i] >= 0 + if stride[i] is not None } # iterate over unbound strides in sorted order From 9c35e33d9b02e384f0d504f942a916e9e849b163 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Thu, 24 Oct 2024 13:38:32 -0700 Subject: [PATCH 058/161] [c10d][CI] Improve world size setting in some tests (#138846) Following change in #137161 , bumping world size for some test suites. 
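The idea, sketched below in simplified form (this is not the exact test-harness code; the helper name is made up for illustration), is to derive the default world size from the visible devices instead of hard-coding it:

```python
import torch
import torch.distributed as dist

def default_world_size(backend: str) -> int:
    # One rank per visible GPU when running NCCL; a small fixed world size
    # for CPU backends such as Gloo.
    if backend == dist.Backend.NCCL and torch.cuda.is_available():
        return torch.cuda.device_count()
    return 2
```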
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138846 Approved by: https://github.com/fduwjj --- test/distributed/test_c10d_object_collectives.py | 7 ++++--- test/distributed/test_c10d_ops_nccl.py | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_c10d_object_collectives.py b/test/distributed/test_c10d_object_collectives.py index ece50ebe8890b..dcd6de797e725 100644 --- a/test/distributed/test_c10d_object_collectives.py +++ b/test/distributed/test_c10d_object_collectives.py @@ -24,7 +24,6 @@ sys.exit(0) BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO -WORLD_SIZE = min(4, max(2, torch.cuda.device_count())) def with_comms(func=None): @@ -54,14 +53,16 @@ def setUp(self): @property def device(self): return ( - torch.device(self.rank) + torch.device("cuda", self.rank % torch.cuda.device_count()) if BACKEND == dist.Backend.NCCL else torch.device("cpu") ) @property def world_size(self): - return WORLD_SIZE + if BACKEND == dist.Backend.NCCL: + return torch.cuda.device_count() + return super().world_size @property def process_group(self): diff --git a/test/distributed/test_c10d_ops_nccl.py b/test/distributed/test_c10d_ops_nccl.py index c9fb0f30b53f9..1b5c4d98f1481 100644 --- a/test/distributed/test_c10d_ops_nccl.py +++ b/test/distributed/test_c10d_ops_nccl.py @@ -28,6 +28,7 @@ init_multigpu_helper, MultiProcContinousTest, requires_nccl, + TEST_SKIPS, ) from torch.testing._internal.common_utils import ( skip_but_pass_in_sandcastle_if, @@ -979,8 +980,14 @@ def allgather_base(output_t, input_t): if __name__ == "__main__": + if not torch.cuda.is_available(): + sys.exit(TEST_SKIPS["no_cuda"].exit_code) + rank = int(os.getenv("RANK", -1)) - world_size = int(os.getenv("WORLD_SIZE", 2)) + world_size = int(os.getenv("WORLD_SIZE", -1)) + + if world_size == -1: # Not set by external launcher + world_size = torch.cuda.device_count() if rank != -1: # Launched with torchrun or other multi-proc launchers. Directly run the test. 
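The world-size fallback installed by the patch above can be summarized as: honor WORLD_SIZE when an external launcher (e.g. torchrun) sets it, otherwise size the job to the accelerators visible on the host. A minimal sketch of that pattern outside the test harness (the helper name and the CPU default are made up for illustration):

import os

import torch

def resolve_world_size() -> int:
    # Prefer the launcher-provided value; -1 means "not set".
    world_size = int(os.getenv("WORLD_SIZE", -1))
    if world_size == -1:
        # Fall back to the locally visible GPU count, defaulting to a
        # single process on CPU-only machines.
        world_size = torch.cuda.device_count() if torch.cuda.is_available() else 1
    return world_size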
From 12755f45ff36c3bd4015661c5780caf1e4062f0d Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 24 Oct 2024 08:32:12 -0700 Subject: [PATCH 059/161] [Pipelining] small comments and variable renames (#138735) Addressing the comments in previous PRs to update the variable names and add additional code comments Pull Request resolved: https://github.com/pytorch/pytorch/pull/138735 Approved by: https://github.com/wconstab ghstack dependencies: #138119, #138504 --- test/distributed/pipelining/test_backward.py | 6 ++--- torch/distributed/pipelining/_backward.py | 23 +++++++++++++------- torch/distributed/pipelining/stage.py | 2 ++ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/test/distributed/pipelining/test_backward.py b/test/distributed/pipelining/test_backward.py index ff2f27c1c0ccf..a19092d8a211d 100644 --- a/test/distributed/pipelining/test_backward.py +++ b/test/distributed/pipelining/test_backward.py @@ -75,7 +75,7 @@ def test_stage_backward_input(self): out = mod(x) loss = loss_fn(out, target) dinputs, param_groups = stage_backward_input( - stage_outputs=(loss,), + stage_outputs_or_loss=(loss,), output_grads=None, input_values=[x], weights=mod.parameters(), @@ -110,7 +110,7 @@ def test_stage_backward_weight(self): out = mod(x) loss = loss_fn(out, target) dinputs, param_groups = stage_backward_input( - stage_outputs=(loss,), + stage_outputs_or_loss=(loss,), output_grads=None, input_values=[x], weights=mod.parameters(), @@ -158,7 +158,7 @@ def test_stage_backward_weight_multiple_iters(self): out = mod(x) loss = loss_fn(out, target) dinputs, param_groups = stage_backward_input( - stage_outputs=(loss,), + stage_outputs_or_loss=(loss,), output_grads=None, input_values=[x], weights=mod.parameters(), diff --git a/torch/distributed/pipelining/_backward.py b/torch/distributed/pipelining/_backward.py index f49d8fdbfb5e2..fe32952e9bca3 100644 --- a/torch/distributed/pipelining/_backward.py +++ b/torch/distributed/pipelining/_backward.py @@ -140,16 +140,23 @@ def get_param_groups( def stage_backward_input( - stage_outputs: List[torch.Tensor], + stage_outputs_or_loss: List[torch.Tensor], output_grads: Optional[List[torch.Tensor]], input_values: List[torch.Tensor], weights: Iterator[Parameter], ): """ - compute the gradients for only the stage inputs with respect to the stage outputs + Compute the gradients for only the stage inputs with + respect to the stage outputs (if non-last stage) or loss (if last stage) + + After computing input gradients, we save the intermediate nodes in `param_groups` + for later use in stage_backward_weight. We don't need to save any other intermediate nodes + that aren't needed for dW because when we do dW calculation, we start from saved intermediates. + Detaching the stage_outputs_or_loss at the end of this function is important as + it frees up the memory that the autograd graph is anticipating to be used later (but doesn't actually need). 
""" stage_output_grad_fns: List[Node] = list( - filter(None, map(_get_grad_fn_or_grad_acc, stage_outputs)) + filter(None, map(_get_grad_fn_or_grad_acc, stage_outputs_or_loss)) ) stage_input_grad_fns: List[Node] = list( filter(None, map(_get_grad_fn_or_grad_acc, input_values)) @@ -185,11 +192,11 @@ def hook(grad_inputs): if output_grads is None: # In case this is the loss and there are no output_grads, then we just use 1s output_grads = [ - torch.ones_like(stage_output) for stage_output in stage_outputs + torch.ones_like(stage_output) for stage_output in stage_outputs_or_loss ] dinputs = torch.autograd.grad( - stage_outputs, + stage_outputs_or_loss, inputs=input_values, grad_outputs=output_grads, retain_graph=True, @@ -202,9 +209,9 @@ def hook(grad_inputs): else: inp.grad += dinputs[i] - # stage_outputs are not used in backwards after this point, so we can safely remove it from the autograd graph - # this allows autograd to clear up the graph dedicated for this output and free up significant memory - for t in stage_outputs: + # stage_outputs_or_loss are not used in backwards after this point, so we can safely remove it from the autograd graph + # this allows autograd to clear up the graph dedicated for this tensor and free up significant memory + for t in stage_outputs_or_loss: t.detach_() else: diff --git a/torch/distributed/pipelining/stage.py b/torch/distributed/pipelining/stage.py index 9c47f68e20c07..7ea111c92e969 100644 --- a/torch/distributed/pipelining/stage.py +++ b/torch/distributed/pipelining/stage.py @@ -697,6 +697,8 @@ def backward_one_chunk( self.dw_runner[bwd_chunk_id] = lambda: None if self.is_last: + # Autograd dependencies: + # rest_of_autograd_graph -> stage_output -> loss # stage_output is no longer used in the last stage for backward and only needed # to return to the user in merge_output_chunks, therefore # this should be detached to release autograd graph context and free memory earlier From 2c82f73647a3928f55a531948e3d1927e492ba3f Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 24 Oct 2024 08:32:12 -0700 Subject: [PATCH 060/161] [Pipelining] Clean up hooks in zero bubble (#138720) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138720 Approved by: https://github.com/wconstab ghstack dependencies: #138119, #138504, #138735 --- torch/distributed/pipelining/_backward.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torch/distributed/pipelining/_backward.py b/torch/distributed/pipelining/_backward.py index fe32952e9bca3..046c1fd1fe130 100644 --- a/torch/distributed/pipelining/_backward.py +++ b/torch/distributed/pipelining/_backward.py @@ -170,6 +170,7 @@ def stage_backward_input( stage_input_grad_fns, weight_grad_fns, reverse_edges_dict ) + handles = [] for param_group in param_groups: for i, intermediate in enumerate(param_group["intermediates"]): @@ -185,7 +186,8 @@ def hook(grad_inputs): # These are always "split" nodes that we need to recompute, so # save their inputs. - intermediate.register_prehook(get_hook(param_group, i)) + handle = intermediate.register_prehook(get_hook(param_group, i)) + handles.append(handle) # Stage 0 inputs do not require grads? Should we skip in that case? 
if all(tensor.requires_grad for tensor in input_values): @@ -217,6 +219,10 @@ def hook(grad_inputs): else: dinputs = None + # hooks are no longer necessary, clean up for consistency + for handle in handles: + handle.remove() + return dinputs, param_groups @@ -255,6 +261,9 @@ def stage_backward_weight( grad_outputs=sum(param_group["grads"], tuple()), retain_graph=retain_graph, ) + # release grad memory early after use + del param_group["grads"] + for grad_acc, dw in zip(param_group["params"], dweights): weight, index = grad_acc_to_weight[grad_acc] if weight.grad is None: From 245026af2d2f26c74993cb90e01bddbd627c6797 Mon Sep 17 00:00:00 2001 From: IvanKobzarev Date: Thu, 24 Oct 2024 13:19:25 -0700 Subject: [PATCH 061/161] [aotd] Unwrap unseen AsyncCollectiveTensor tangents (#138731) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138731 Approved by: https://github.com/bdhirsh --- test/functorch/test_aotdispatch.py | 30 +++++++++++++++++++ .../_aot_autograd/runtime_wrappers.py | 15 +++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index 3e1eeb8255b75..b1689e5afb2ef 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -45,6 +45,7 @@ from torch._higher_order_ops.out_dtype import out_dtype from torch._inductor.codecache import compiled_fx_graph_hash from torch._subclasses.fake_tensor import DynamicOutputShapeException, FakeTensorMode +from torch.distributed._functional_collectives import AsyncCollectiveTensor from torch.fx.experimental.proxy_tensor import is_sym_node from torch.fx.experimental.symbolic_shapes import GuardOnDataDependentSymNode, ShapeEnv from torch.nn.utils.rnn import PackedSequence @@ -6184,6 +6185,35 @@ def fn(x): out_buffer = out.values() ga, gb, gc = torch.autograd.grad(out_buffer.sum(), (a, b, c)) + def test_unwrap_async_collective_tensor_tangent(self): + def fn(x): + return x.clone() + + ref_x = TwoTensor( + torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) + ) + ref_y = fn(ref_x) + ref_y.backward(gradient=TwoTensor(torch.randn(2, 3), torch.randn(2, 3))) + + fn_comp = torch.compile(fn, fullgraph=True) + + x = TwoTensor( + torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) + ) + y = fn_comp(x) + y.backward(gradient=TwoTensor(torch.randn(2, 3), torch.randn(2, 3))) + + x2 = TwoTensor( + torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) + ) + y2 = fn_comp(x2) + y2.backward( + gradient=TwoTensor( + AsyncCollectiveTensor(torch.randn(2, 3)), + AsyncCollectiveTensor(torch.randn(2, 3)), + ) + ) + @torch._inductor.config.patch({"freezing": True}) def test_inductor_freezing_with_subclasses(self): class M(torch.nn.Module): diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 9e8a21321ad77..7ab1d41a30cc7 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -27,6 +27,7 @@ ) from torch._prims_common import CUDARngStateHelper from torch._subclasses import FakeTensor +from torch.distributed._functional_collectives import AsyncCollectiveTensor from torch.fx.experimental._backward_state import BackwardState from torch.multiprocessing.reductions import StorageWeakRef from torch.utils._python_dispatch import is_traceable_wrapper_subclass @@ -1443,7 +1444,19 @@ def coerce_runtime_tangent_tracing_memory_format(x, memory_format): 
return x is_subclass: bool = is_traceable_wrapper_subclass(x) - mem_format = memory_format[0] if is_subclass else memory_format + mem_format = memory_format + if is_subclass: + memory_format_for_dense_tensor = not isinstance(memory_format, list) + if isinstance(x, AsyncCollectiveTensor) and memory_format_for_dense_tensor: + # This is AsyncCollectiveTensor, that we have not seen during tracing time. + while True: + x = x.trigger_wait() + # Checking recursive AsyncCollectiveTensor + if not isinstance(x, AsyncCollectiveTensor): + break + is_subclass = False + else: + mem_format = memory_format[0] if not x.is_contiguous(memory_format=mem_format): x = x.contiguous(memory_format=mem_format) From 45b8155a078879f9a9639878ddfa264385390970 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 25 Oct 2024 13:42:37 +0000 Subject: [PATCH 062/161] [CI] Run periodic jobs only on pytorch/pytorch repo (#138874) Github by default tries to not run periodic jobs on forks, see https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/disabling-and-enabling-a-workflow But there is a special test repo called `pytorch/canary`, that will run those workflows for next 60 days, which is a waste of resources Pull Request resolved: https://github.com/pytorch/pytorch/pull/138874 Approved by: https://github.com/huydhn --- .github/workflows/inductor-cu124.yml | 1 + .github/workflows/inductor-micro-benchmark-x86.yml | 1 + .github/workflows/inductor-micro-benchmark.yml | 1 + .github/workflows/inductor-perf-compare.yml | 1 + .github/workflows/inductor-perf-test-nightly-a10g.yml | 1 + .github/workflows/inductor-perf-test-nightly-aarch64.yml | 1 + .github/workflows/inductor-perf-test-nightly-x86.yml | 1 + .github/workflows/inductor-perf-test-nightly.yml | 1 + .github/workflows/inductor-periodic.yml | 1 + .github/workflows/inductor-rocm.yml | 1 + .github/workflows/inductor.yml | 1 + .github/workflows/nightly.yml | 1 + .github/workflows/periodic.yml | 1 + .github/workflows/pull.yml | 1 + .github/workflows/rocm.yml | 1 + .github/workflows/slow.yml | 1 + .github/workflows/trunk.yml | 1 + 17 files changed, 17 insertions(+) diff --git a/.github/workflows/inductor-cu124.yml b/.github/workflows/inductor-cu124.yml index bb96f16cb01ab..bddbc9c730af4 100644 --- a/.github/workflows/inductor-cu124.yml +++ b/.github/workflows/inductor-cu124.yml @@ -21,6 +21,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index d31dbc5951ea1..cbd9a5dace798 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -17,6 +17,7 @@ permissions: read-all jobs: linux-jammy-cpu-py3_9-gcc11-inductor-build: + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' name: linux-jammy-cpu-py3.9-gcc11-inductor uses: ./.github/workflows/_linux-build.yml with: diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index 204cf167917b9..e8270abd469aa 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -19,6 +19,7 @@ jobs: 
get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 761eb77223c5a..0f459f42107fc 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -25,6 +25,7 @@ jobs: get-test-label-type: name: get-test-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-nightly-a10g.yml b/.github/workflows/inductor-perf-test-nightly-a10g.yml index a15bff1d5c637..cd208bfde262d 100644 --- a/.github/workflows/inductor-perf-test-nightly-a10g.yml +++ b/.github/workflows/inductor-perf-test-nightly-a10g.yml @@ -71,6 +71,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index caf2eeff7ffd1..e51950ca74ad9 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -51,6 +51,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 997cf3cacc93c..0d9d79332f945 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -51,6 +51,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 1ef402cb4ca55..84a935e196a76 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -69,6 +69,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-periodic.yml 
b/.github/workflows/inductor-periodic.yml index 9e605d5bb8cc9..2abc1e40a3699 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -21,6 +21,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index 7b97517ba5b61..faf386881734b 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -25,6 +25,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 56bf1328ca260..92e09623dbb51 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -21,6 +21,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index cd5c18853d54b..c806b525c6425 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -20,6 +20,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index f5e71bceb4d26..3711b4dd68cf2 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -41,6 +41,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 17b700f9eadf4..6171fc5c0aa35 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -38,6 +38,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 051a5eb1a9b71..6aa01ff179eff 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -26,6 +26,7 @@ jobs: contents: read linux-focal-rocm6_2-py3_10-build: + if: github.event_name != 
'schedule' || github.repository == 'pytorch/pytorch' name: linux-focal-rocm6.2-py3.10 uses: ./.github/workflows/_linux-build.yml with: diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 551eb76b6f4be..2aab56e971f81 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -39,6 +39,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 276cb9155b58a..655fb72b20e69 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -37,6 +37,7 @@ jobs: get-label-type: name: get-label-type uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch' with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} From 78a0158540a669893245534b89af55a9de4cb082 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Fri, 25 Oct 2024 13:55:41 +0000 Subject: [PATCH 063/161] [Dynamo] Improve `args` in `higher_order_ops` [1/N] (#138799) Replaced hard-coded argument indices with meaningful variable names. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138799 Approved by: https://github.com/zou3519 --- torch/_dynamo/variables/higher_order_ops.py | 67 ++++++++++++--------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/torch/_dynamo/variables/higher_order_ops.py b/torch/_dynamo/variables/higher_order_ops.py index 505e715145504..3af819db81dbd 100644 --- a/torch/_dynamo/variables/higher_order_ops.py +++ b/torch/_dynamo/variables/higher_order_ops.py @@ -679,38 +679,40 @@ def call_function( ) # Specialize into one of the branches since pred is constant - if type(args[0]) is ConstantVariable: + pred, true_fn, false_fn, operands = args + if type(pred) is ConstantVariable: log.warning( "Pred is a Python constant. When used with torch.cond, it executes only one of the branches." " If you want torch.cond to perserve two branches, please make the predicate a boolean tensor or a SymBool." 
) - if args[0].as_python_constant(): - return args[1].call_function(tx, args[3].unpack_var_sequence(tx), {}) + if pred.as_python_constant(): + return true_fn.call_function(tx, operands.unpack_var_sequence(tx), {}) else: - return args[2].call_function(tx, args[3].unpack_var_sequence(tx), {}) + return false_fn.call_function(tx, operands.unpack_var_sequence(tx), {}) # predicate - if type(args[0]) not in (ConstantVariable, TensorVariable, SymNodeVariable): + if type(pred) not in (ConstantVariable, TensorVariable, SymNodeVariable): unimplemented( f"Expected pred to be bool or a boolean tensor with single " - f"item but got {str(type(args[0]))} " - f"with original python type {str(args[0].python_type())}.", + f"item but got {str(type(pred))} " + f"with original python type {str(pred.python_type())}.", ) # operands - if not isinstance(args[3], (ListVariable, TupleVariable)): + if not isinstance(operands, (ListVariable, TupleVariable)): unimplemented( - f"Expected a tuple but got {args[3].python_type()}", + f"Expected operands to be a list/tuple but got " + f"{operands.python_type()}", ) - operands = args[3].unpack_var_sequence(tx) - if not only_consist_of(args[3], (TensorVariable,)): + operands_seq = operands.unpack_var_sequence(tx) + if not only_consist_of(operands, (TensorVariable,)): unimplemented( "Expect operands to be a tuple of pytrees that only consists of tensor leaves." ) # branches - _check_supported_callable_arg(tx, args[1], "true_fn") - _check_supported_callable_arg(tx, args[2], "false_fn") + _check_supported_callable_arg(tx, true_fn, "true_fn") + _check_supported_callable_arg(tx, false_fn, "false_fn") # Our strategy for tracing the true/false branches of cond # are to checkpoint our graphstate, run the true branch, @@ -736,7 +738,7 @@ def speculate_branch(branch): ) = speculate_subgraph( tx, args[ix], - operands, + operands_seq, {}, "cond", source_target=self.value, @@ -823,7 +825,7 @@ def diff_meta(tensor_vars1, tensor_vars2): false_node = make_attr(tx, false_name) p_args = ( - args[0].as_proxy(), + pred.as_proxy(), true_node, false_node, # We pick true_shared but it shouldn't matter @@ -909,26 +911,30 @@ def call_function( f"Usage: while_loop(cond_fn, body_fn, operands)", ) - _check_supported_callable_arg(tx, args[0], "cond_fn") - _check_supported_callable_arg(tx, args[1], "body_fn") + cond_fn, body_fn, operands, additional_inputs = args + _check_supported_callable_arg(tx, cond_fn, "cond_fn") + _check_supported_callable_arg(tx, body_fn, "body_fn") # operands - if not isinstance(args[2], (ListVariable, TupleVariable)): + if not isinstance(operands, (ListVariable, TupleVariable)): unimplemented( - f"Expected a tuple but got {args[2].python_type()}", + f"Expected operands to be a list/tuple but got " + f"{operands.python_type()}", ) - operands = args[2].unpack_var_sequence(tx) - if not only_consist_of(args[2], (TensorVariable,)): + operands_seq = operands.unpack_var_sequence(tx) + if not only_consist_of(operands, (TensorVariable,)): unimplemented( "Expect operands to be a tuple of pytrees that only consists of tensor leaves." ) # additional inputs check - if not isinstance(args[3], (ListVariable, TupleVariable)): + if not isinstance(additional_inputs, (ListVariable, TupleVariable)): unimplemented( - f"Expected a tuple but got {args[3].python_type()}", + f"Expected additional_inputs to be a list/tuple but got " + f"{additional_inputs.python_type()}. It seems to be an " + f"internal error, please report an issue to PyTorch." 
) - additional_inputs = args[3].unpack_var_sequence(tx) + additional_inputs_seq = additional_inputs.unpack_var_sequence(tx) ( (cond_r, cond_treespec), @@ -936,8 +942,8 @@ def call_function( cond_lifted_freevars, ) = speculate_subgraph( tx, - args[0], - operands + additional_inputs, + cond_fn, + operands_seq + additional_inputs_seq, {}, "while_loop", source_target=self.value, @@ -965,8 +971,8 @@ def call_function( body_lifted_freevars, ) = speculate_subgraph( tx, - args[1], - operands + additional_inputs, + body_fn, + operands_seq + additional_inputs_seq, {}, "while_loop", source_target=self.value, @@ -1012,9 +1018,10 @@ def call_function( p_args = ( cond_node, body_node, - tuple([operand.as_proxy() for operand in operands]), + tuple([operand.as_proxy() for operand in operands_seq]), tuple( - [inp.as_proxy() for inp in additional_inputs] + additional_lifted_inputs + [inp.as_proxy() for inp in additional_inputs_seq] + + additional_lifted_inputs ), ) From 6cadf616aeb612f3c866b734268919ad1616ffaf Mon Sep 17 00:00:00 2001 From: Sam Larsen Date: Wed, 23 Oct 2024 19:24:41 -0700 Subject: [PATCH 064/161] [fx graph cache] FxGraphPickler: Remove hack to stabilize device string hashes (#138681) Summary: With the fast pickling mode, we don't need the custom hack for replacing device strings in tensors. This was previously needed because, e.g., two strings "cuda" will pickle differently if they are the same object vs. not. Test Plan: The new test fails with fast mode commented out, but succeeds when enabled: `python test/inductor/test_codecache.py -k test_stable_strings` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138681 Approved by: https://github.com/oulgen --- test/inductor/test_codecache.py | 16 +++++++++ .../_aot_autograd/autograd_cache.py | 10 ++---- torch/_inductor/codecache.py | 35 +++++-------------- 3 files changed, 26 insertions(+), 35 deletions(-) diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index 70d1ae48f7cb0..b7b7f11ccea9b 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -835,6 +835,22 @@ def uuid(self) -> Optional[Union[bytes, str]]: FxGraphCachePickler.dumps(details3), ) + def test_stable_strings(self): + """ + Test that objects containing identical strings pickle the same + even if they are not the same id. + """ + s1 = "string" + s2 = "strin" + s2 += "g" + + self.assertNotEqual(id(s1), id(s2)) + + self.assertEqual( + FxGraphCachePickler.dumps([s1, s1]), + FxGraphCachePickler.dumps([s1, s2]), + ) + def test_get_hash_for_files(self): """ Test the get_hash_for_files helper. diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index 9512e6561a438..aaf05634b343b 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -250,14 +250,8 @@ def _reduce_tensor(tensor): """ Reduce the tensor to a stable key for caching. 
""" - return ( - _ident, - ( - extract_tensor_metadata_for_cache_key( - FxGraphCachePickler._device_map, tensor - ), - ), - ) + metadata = extract_tensor_metadata_for_cache_key(tensor) + return (_ident, (metadata,)) class AOTAutogradCachePickler(FxGraphCachePickler): diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index c914c6a7338bd..618f7a5d10840 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -506,9 +506,7 @@ def _ident(x: T) -> T: return x -def extract_tensor_metadata_for_cache_key( - device_map: Dict[torch.device, torch.device], t: Tensor -) -> TensorMetadata: +def extract_tensor_metadata_for_cache_key(t: Tensor) -> TensorMetadata: """ Extracts the tensor metadata and removes fields of the TensorMetadata that are not needed for caching @@ -517,32 +515,19 @@ def extract_tensor_metadata_for_cache_key( if not hasattr(t, "_is_inductor_static"): meta = dataclasses.replace(meta, storage_offset=0, storage_bytes=None) - # The pickle implementation avoids serializing the same object more than once. - # That behavior means the byte stream we create to hash will vary if, for example, - # we see two tensor objects with the same device, but the torch.device object is - # actually the same object vs. merely equivalent. We want to produce the same hash - # value in either situation, so we memoize the device objects and always reference - # the same object for a given device. It's possible other metadata fields deserve - # the same treatment, but so far we've only observed this issue with the device. - if meta.device not in device_map: - device_map[meta.device] = meta.device - meta = dataclasses.replace(meta, device=device_map[meta.device]) - return meta -def _reduce_fake_tensor( - device_map: Dict[torch.device, torch.device], t: Tensor -) -> Tuple[Callable[[T], T], Tuple[TensorMetadata]]: +def _reduce_fake_tensor(t: Tensor) -> Tuple[Callable[[T], T], Tuple[TensorMetadata]]: """ See FxGraphCachePickler. Custom reducer to pickle FakeTensors. """ - metadata = extract_tensor_metadata_for_cache_key(device_map, t) + metadata = extract_tensor_metadata_for_cache_key(t) return (_ident, (metadata,)) def _reduce_tensor( - device_map: Dict[torch.device, torch.device], t: Tensor + t: Tensor, ) -> Tuple[Callable[[T], T], Tuple[TensorMetadataAndValues]]: """ See FxGraphCachePickler. Custom reducer to pickle Tensors. @@ -570,7 +555,7 @@ def _reduce_tensor( f"FX graph cache handling of a large constant took {elapsed:.1}s. Please file an issue." ) - metadata = extract_tensor_metadata_for_cache_key(device_map, t) + metadata = extract_tensor_metadata_for_cache_key(t) return (_ident, (TensorMetadataAndValues(metadata, values),)) @@ -600,13 +585,9 @@ class FxGraphCachePickler(pickle.Pickler): data that allow us to compute a stable, but safe hash. """ - # See extract_tensor_metadata_for_cache_key. Whenever we extract metadata during - # pickling, we make sure devices always reference the same torch.device object. 
-    _device_map: Dict[torch.device, torch.device] = {}
-
     dispatch_table = copyreg.dispatch_table.copy()
-    dispatch_table[FakeTensor] = functools.partial(_reduce_fake_tensor, _device_map)
-    dispatch_table[torch.Tensor] = functools.partial(_reduce_tensor, _device_map)
+    dispatch_table[FakeTensor] = _reduce_fake_tensor
+    dispatch_table[torch.Tensor] = _reduce_tensor
     dispatch_table[torch.SymInt] = _reduce_symint
     dispatch_table[
         torch.fx.experimental._backward_state.BackwardState
@@ -648,7 +629,7 @@ def debug_lines(cls, inp: FxGraphHashDetails) -> List[str]:

         def get_str(obj: Any) -> str:
             if isinstance(obj, torch.Tensor):
-                return str(extract_tensor_metadata_for_cache_key(cls._device_map, obj))
+                return str(extract_tensor_metadata_for_cache_key(obj))
             elif isinstance(obj, bytes):
                 return "<bytes>"
             elif type(obj) in cls.dispatch_table:

From fe18a221eb35e603002b094ad8ceb353e2cdca56 Mon Sep 17 00:00:00 2001
From: eellison
Date: Thu, 24 Oct 2024 10:22:17 -0700
Subject: [PATCH 065/161] Add debug backend that applies CrossRefFakeMode, use in compiler bisector (#138651)

I was debugging an internal NE divergence for a while that turned out to be
caused by a bad meta. I added an explicit config option and an explicit
backend `aot_eager_decomp_partition_crossref` to enable the FakeCrossRefMode
when running the graph. I added an explicit backend because I suspect it will
be useful for internal models, but I'm also happy to leave it as a config
option. It only tests ops that have a meta, to avoid the memory overhead of
hitting the fallback path and running in eager.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138651
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
---
 test/dynamo/test_compiler_bisector.py | 64 +++++++++++++++++++++++++++
 torch/_dynamo/backends/debugging.py   | 32 +++++++++++++-
 torch/_functorch/config.py            |  4 ++
 torch/_inductor/bisect_helper.py      |  2 +
 torch/_subclasses/fake_utils.py       |  6 +++
 5 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/test/dynamo/test_compiler_bisector.py b/test/dynamo/test_compiler_bisector.py
index f89f935b18cce..a5671dd4a20a4 100644
--- a/test/dynamo/test_compiler_bisector.py
+++ b/test/dynamo/test_compiler_bisector.py
@@ -9,6 +9,7 @@
 from torch._dynamo.test_case import TestCase
 from torch._inductor import config
 from torch._inductor.bisect_helper import BisectionManager
+from torch.library import _scoped_library, Library
 from torch.testing._internal.inductor_utils import HAS_CUDA


@@ -23,6 +24,23 @@

 @requires_cuda
 class TestCompilerBisector(TestCase):
+    test_ns = "_test_bisector"
+
+    def tearDown(self):
+        if hasattr(torch.ops, self.test_ns):
+            delattr(torch.ops, self.test_ns)
+        if hasattr(self, "lib"):
+            del self.lib.m
+            del self.lib
+
+    def get_op(self, name):
+        return getattr(getattr(torch.ops, self.test_ns), name).default
+
+    def get_lib(self):
+        lib = Library(self.test_ns, "FRAGMENT")  # noqa: TOR901
+        self.lib = lib
+        return lib
+
     def test_bad_decomp(self):
         mod = import_module("torch._inductor.compile_fx")

@@ -78,6 +96,52 @@ def test_fn():

         self.assertEqual(out.bisect_number, 1)
         self.assertTrue("aten.exponential" in out.debug_info)

+    def test_crossref(self):
+        test_ns = "bisect_ops"
+        with _scoped_library(self.test_ns, "FRAGMENT") as lib:
+            lib.define("foo(Tensor x) -> Tensor")
+            op = self.get_op("foo")
+
+            class Foo(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, x):
+                    # Emulate AutoDispatchBelowADInplaceOrView, which is not bound into python
+                    with torch._C._AutoDispatchBelowAutograd():
+                        with torch._C._ExcludeDispatchKeyGuard(
+ torch._C.DispatchKeySet( + torch._C.DispatchKey.ADInplaceOrView + ) + ): + return op(x) + + @staticmethod + def backward(ctx, gx): + return gx + + def foo_impl(x): + return x.view_as(x).clone() + + def foo_meta(x): + return x.view_as(x) + + lib.impl("foo", Foo.apply, "Autograd") + lib.impl("foo", foo_impl, "CPU") + lib.impl("foo", foo_meta, "Meta") + + x = torch.tensor(3.14159 / 3, requires_grad=True) + + def test_fn(): + torch._dynamo.reset() + + try: + torch.testing.assert_allclose(torch.compile(op)(x), op(x)) + except Exception: + return False + return True + + out = BisectionManager.do_bisect(test_fn) + self.assertEqual(out.backend, "aot_eager_decomp_partition_crossref") + def test_emulate_precision_casts(self): def test_fn(): torch._dynamo.reset() diff --git a/torch/_dynamo/backends/debugging.py b/torch/_dynamo/backends/debugging.py index abd5111dbb1aa..94ed9b0865091 100644 --- a/torch/_dynamo/backends/debugging.py +++ b/torch/_dynamo/backends/debugging.py @@ -118,6 +118,23 @@ def run(args): return run +def fake_crossref_boxed_nop(fx_g, example_inputs): + def run(args): + with torch._subclasses.CrossRefFakeMode(): + return torch.fx.Interpreter(fx_g).boxed_run(args) + + run._boxed_call = True + return run + + +def get_nop_func(): + return ( + boxed_nop + if not torch._functorch.config.fake_tensor_crossref + else fake_crossref_boxed_nop + ) + + # Useful for debugging purpose # aot_eager uses AOT Autograd backend with nop compiler. It is helpful in debugging. def aot_eager( @@ -166,8 +183,8 @@ def aot_eager_decomp_partition(gm, fake_tensor_inputs, **kwargs): with functorch_config.patch(config_patches): return aot_autograd( # these are taken from memory_efficient_fusion() - fw_compiler=boxed_nop, - bw_compiler=boxed_nop, + fw_compiler=get_nop_func(), + bw_compiler=get_nop_func(), # NB: lambda here is to delay import of inductor decompositions=lambda: import_module( "torch._inductor.compile_fx" @@ -183,6 +200,17 @@ def aot_eager_decomp_partition(gm, fake_tensor_inputs, **kwargs): ) +def aot_eager_decomp_partition_crossref(gm, fake_tensor_inputs, **kwargs): + with functorch_config.patch(fake_tensor_crossref=True): + return aot_eager_decomp_partition(gm, fake_tensor_inputs, **kwargs) + + +register_backend( + name="aot_eager_decomp_partition_crossref", + compiler_fn=aot_eager_decomp_partition_crossref, +) + + # AOT Autograd with torchscript backend. Default partitioner. # aot_ts uses torchscript backend. We can use this with both nnc and nvfuser # by using the relevant fuser with torch.jit.fuser(...) diff --git a/torch/_functorch/config.py b/torch/_functorch/config.py index 8c042ee7ed56a..9d148de1aa794 100644 --- a/torch/_functorch/config.py +++ b/torch/_functorch/config.py @@ -162,6 +162,10 @@ def remote_autograd_cache_default() -> Optional[bool]: # tokens. unlift_effect_tokens = False + +# Run aot eager decomp partition with CrossRefFakeMode +fake_tensor_crossref = False + # This mode specifies that we should also keep track of the real # tensor along with the fake tensor, and do real compute. While # seemingly this eliminates the whole point of fake tensors, there are diff --git a/torch/_inductor/bisect_helper.py b/torch/_inductor/bisect_helper.py index b072aea53e529..5cb1dd5691804 100644 --- a/torch/_inductor/bisect_helper.py +++ b/torch/_inductor/bisect_helper.py @@ -53,6 +53,8 @@ def __post_init__(self) -> None: "decomposition" ), # number of decompositions we apply in tracing ], # TODO - add cse ? 
+ # applies CrossRefFakeMode on invocation + "aot_eager_decomp_partition_crossref": [], "inductor": [ BisectSubsystem( "post_grad_passes" diff --git a/torch/_subclasses/fake_utils.py b/torch/_subclasses/fake_utils.py index 28fc7a4028917..c610ee9dbab40 100644 --- a/torch/_subclasses/fake_utils.py +++ b/torch/_subclasses/fake_utils.py @@ -82,6 +82,7 @@ def __init__( *, check_strides=True, check_aliasing=True, + only_check_ops_with_meta=True, ): super().__init__() self.ignore_op_fn = ( @@ -89,6 +90,7 @@ def __init__( ) self.check_strides = check_strides self.check_aliasing = check_aliasing + self.only_check_ops_with_meta = only_check_ops_with_meta def __torch_dispatch__(self, func, types, args=(), kwargs=None): kwargs = kwargs or {} @@ -105,6 +107,10 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None): aten.set_.source_Storage_storage_offset, ) and not self.ignore_op_fn(func) + and ( + not self.only_check_ops_with_meta + or torch._subclasses.fake_impls.has_meta(func) + ) and torch.Tag.dynamic_output_shape not in func.tags and torch.Tag.inplace_view not in func.tags and torch.Tag.data_dependent_output not in func.tags From 817b4988e487639b7b282afcb7a564cf0d507911 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Thu, 24 Oct 2024 14:00:36 -0700 Subject: [PATCH 066/161] [dynamo][config-cleanup] Remove enable_cpp_guard_manager=False codepath (#138512) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138512 Approved by: https://github.com/williamwen42, https://github.com/jansel --- test/dynamo/test_frame_init.py | 6 +- torch/_dynamo/config.py | 4 +- torch/_dynamo/guards.py | 676 ++++++++++-------------------- torch/csrc/dynamo/cache_entry.cpp | 7 +- torch/csrc/dynamo/extra_state.cpp | 10 +- 5 files changed, 239 insertions(+), 464 deletions(-) diff --git a/test/dynamo/test_frame_init.py b/test/dynamo/test_frame_init.py index 5abf6a45c7429..00206d52e3936 100644 --- a/test/dynamo/test_frame_init.py +++ b/test/dynamo/test_frame_init.py @@ -87,11 +87,13 @@ def test_frame_init(self): target_with_varkwargs.__code__: varkwargs_code2.__code__, } + empty_guard_manager = torch._dynamo.guards.GuardManager() + def callback1(frame, cache_entry, frame_state): if frame.f_code in code_map1: transformed_code = code_map1[frame.f_code] return torch._dynamo.types.GuardedCode( - transformed_code, lambda f_locals: True, CompileId(0, 0) + transformed_code, empty_guard_manager, CompileId(0, 0) ) return None @@ -99,7 +101,7 @@ def callback2(frame, cache_entry, frame_state): if frame.f_code in code_map2: transformed_code = code_map2[frame.f_code] return torch._dynamo.types.GuardedCode( - transformed_code, lambda f_locals: True, CompileId(0, 0) + transformed_code, empty_guard_manager, CompileId(0, 0) ) return None diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py index 7dae960758093..e974e4ccb852b 100644 --- a/torch/_dynamo/config.py +++ b/torch/_dynamo/config.py @@ -369,8 +369,8 @@ def _get_optimize_ddp_mode(): # use numpy's PRNG if True, pytorch otherwise use_numpy_random_stream = False -# Use C++ guard manager -enable_cpp_guard_manager = os.environ.get("TORCHDYNAMO_CPP_GUARD_MANAGER", "1") == "1" +# Use C++ guard manager (deprecated: always true) +enable_cpp_guard_manager = True # Inline inbuilt nn modules inline_inbuilt_nn_modules = not is_fbcode() diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 186a4b6e27be7..717eb1499c799 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -13,7 +13,6 @@ import itertools import logging import math -import 
os import re import sys import textwrap @@ -47,7 +46,6 @@ install_no_tensor_aliasing_guard, install_object_aliasing_guard, RootGuardManager, - TensorGuards, ) from torch._dynamo.source import ( is_from_flatten_script_object_source, @@ -528,7 +526,7 @@ def __init__( lookup_weakrefs: Callable[[object], ReferenceType[object]], local_scope: Dict[str, object], global_scope: Dict[str, object], - guard_manager: Optional[GuardManager], + guard_manager: GuardManager, check_fn_manager: CheckFunctionManager, ): self.id_ref = id_ref @@ -828,7 +826,6 @@ def manager_guards_on_keys(self, mgr_enum): ) def get_global_guard_manager(self): - assert self.guard_manager # to make mypy happy return self.guard_manager.root.globals_dict_manager( f_globals=self.scope["G"], source="G", @@ -837,7 +834,6 @@ def get_global_guard_manager(self): ) def get_guard_manager_from_source(self, source): - assert self.guard_manager # to make mypy happy root_guard_manager = self.guard_manager.root example_value = None @@ -1162,7 +1158,6 @@ def add_python_lambda_leaf_guard_to_root( globals_for_guard_fn = {"G": self.scope["G"]} exec(pycode, globals_for_guard_fn, out) guard_fn = out["___make_guard_fn"](*closure_vars.values()) - assert self.guard_manager # to make mypy happy if is_epilogue: # Epilogue guards are run after all the other guards have finished. # If epilogue guards contain a getattr or getitem access, one of the @@ -1231,44 +1226,39 @@ def HASATTR(self, guard: Guard): guard, [code], provided_guarded_object=self.get(base) ) - if config.enable_cpp_guard_manager: - base_manager = self.get_guard_manager_from_source(base_source) - if val: - # Just install a getattr manager. GetAttrGuardAccessor itself - # acts as hasattr guard. - example_value = self.get(source.name()) - base_example_value = self.get(base) - guard_manager_enum = self.get_guard_manager_type(source, example_value) - - # if the base value is nn.Module, check if we can speedup the - # guard by going through __dict__ attrs. - if ( - isinstance(base_example_value, torch.nn.Module) - and get_custom_getattr(base_example_value) - is unpatched_nn_module_getattr - ): - return self.getattr_on_nn_module( - source, - base_manager, - base_example_value, - example_value, - base, - source.name(), - guard_manager_enum, - ) - else: - base_manager.getattr_manager( - attr=attr, - source=guard.name, - example_value=example_value, - guard_manager_enum=guard_manager_enum, - ) + base_manager = self.get_guard_manager_from_source(base_source) + if val: + # Just install a getattr manager. GetAttrGuardAccessor itself + # acts as hasattr guard. + example_value = self.get(source.name()) + base_example_value = self.get(base) + guard_manager_enum = self.get_guard_manager_type(source, example_value) + + # if the base value is nn.Module, check if we can speedup the + # guard by going through __dict__ attrs. 
+ if ( + isinstance(base_example_value, torch.nn.Module) + and get_custom_getattr(base_example_value) + is unpatched_nn_module_getattr + ): + return self.getattr_on_nn_module( + source, + base_manager, + base_example_value, + example_value, + base, + source.name(), + guard_manager_enum, + ) else: - base_manager.add_no_hasattr_guard( - attr, get_verbose_code_parts(code, guard) + base_manager.getattr_manager( + attr=attr, + source=guard.name, + example_value=example_value, + guard_manager_enum=guard_manager_enum, ) else: - self._produce_guard_code(guard, [code]) + base_manager.add_no_hasattr_guard(attr, get_verbose_code_parts(code, guard)) def NOT_PRESENT_IN_GENERIC_DICT(self, guard: Guard, attr=None) -> None: assert attr is not None @@ -1297,12 +1287,9 @@ def TYPE_MATCH(self, guard: Guard) -> None: code = f"___check_type_id({self.arg_ref(guard)}, {obj_id})" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_type_match_guard( - obj_id, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + self.get_guard_manager(guard).add_type_match_guard( + obj_id, get_verbose_code_parts(code, guard) + ) def DICT_VERSION(self, guard: Guard): # ___check_dict_version is same as `dict_version(x) == y` @@ -1312,14 +1299,11 @@ def DICT_VERSION(self, guard: Guard): code = f"___dict_version({ref}) == {version}" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - # TODO(anijain2305) - Delete this when DictGuardManager uses tags - # for dicts. - self.get_guard_manager(guard).add_dict_version_guard( - val, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + # TODO(anijain2305) - Delete this when DictGuardManager uses tags + # for dicts. + self.get_guard_manager(guard).add_dict_version_guard( + val, get_verbose_code_parts(code, guard) + ) def DICT_CONTAINS(self, guard: Guard, key: str, invert: bool): dict_ref = self.arg_ref(guard) @@ -1328,12 +1312,9 @@ def DICT_CONTAINS(self, guard: Guard, key: str, invert: bool): code = f"{maybe_not}___dict_contains({key!r}, {dict_ref})" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_dict_contains_guard( - not invert, key, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + self.get_guard_manager(guard).add_dict_contains_guard( + not invert, key, get_verbose_code_parts(code, guard) + ) def ID_MATCH(self, guard: Guard): # ___check_obj_id is same as `id(x) == y` @@ -1349,12 +1330,9 @@ def ID_MATCH(self, guard: Guard): code = f"___check_obj_id({ref}, {id_val})" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_id_match_guard( - id_val, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + self.get_guard_manager(guard).add_id_match_guard( + id_val, get_verbose_code_parts(code, guard) + ) # Keep track of ID_MATCH'd objects. 
This will be used to modify the # cache size logic @@ -1375,32 +1353,22 @@ def NOT_NONE_MATCH(self, guard: Guard, value=None): code = f"{ref} is not None" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_not_none_guard( - get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + self.get_guard_manager(guard).add_not_none_guard( + get_verbose_code_parts(code, guard) + ) def NAME_MATCH(self, guard: Guard): self._guard_on_attribute(guard, "__name__", GuardBuilder.EQUALS_MATCH) def DATA_PTR_MATCH(self, guard: Guard): - # Add a type check. C++ guard has the type check internally, so only - # enable it for Python guards. - if not config.enable_cpp_guard_manager: - self.TYPE_MATCH(guard) - + # C++ guard has the type check internally obj = self.get(guard.name) code = f"{self.arg_ref(guard)}.data_ptr() == {obj.data_ptr()}" self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_data_ptr_guard( - obj, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, [code]) + self.get_guard_manager(guard).add_data_ptr_guard( + obj, get_verbose_code_parts(code, guard) + ) def DUAL_LEVEL(self, guard: Guard): # Invalidate dual level if current dual level is different than the one @@ -1408,19 +1376,15 @@ def DUAL_LEVEL(self, guard: Guard): dual_level = torch.autograd.forward_ad._current_level code = [f"torch.autograd.forward_ad._current_level == {dual_level}"] self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - # TODO(anijain2305) - Consider this moving this guard to C++ - forward_ad = torch.autograd.forward_ad + # TODO(anijain2305) - Consider this moving this guard to C++ + forward_ad = torch.autograd.forward_ad - def fn(x): - return forward_ad._current_level == dual_level + def fn(x): + return forward_ad._current_level == dual_level - assert self.guard_manager # to make mypy happy - self.guard_manager.root.add_lambda_guard( - fn, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, code) + self.guard_manager.root.add_lambda_guard( + fn, get_verbose_code_parts(code, guard) + ) def FUNCTORCH_STACK_MATCH(self, guard: Guard): # Invalidate functorch code if current level is different than @@ -1430,19 +1394,15 @@ def FUNCTORCH_STACK_MATCH(self, guard: Guard): code = [f"torch._functorch.pyfunctorch.compare_functorch_state({states})"] self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - # TODO(anijain2305) - Consider this moving this guard to C++ - compare_fn = torch._functorch.pyfunctorch.compare_functorch_state + # TODO(anijain2305) - Consider this moving this guard to C++ + compare_fn = torch._functorch.pyfunctorch.compare_functorch_state - def fn(x): - return compare_fn(states) + def fn(x): + return compare_fn(states) - assert self.guard_manager # to make mypy happy - self.guard_manager.root.add_lambda_guard( - fn, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, code) + self.guard_manager.root.add_lambda_guard( + fn, get_verbose_code_parts(code, guard) + ) def TENSOR_SUBCLASS_METADATA_MATCH(self, guard: Guard): value = self.get(guard.name) @@ -1461,15 +1421,9 @@ def metadata_checker(x): return x.__tensor_flatten__()[1] == original_metadata global_name = f"___check_metadata_{id(metadata_checker)}_c{CompileContext.current_compile_id()}" - if config.enable_cpp_guard_manager: - 
self.get_guard_manager(guard).add_lambda_guard( - metadata_checker, get_verbose_code_parts(global_name, guard) - ) - else: - global_scope = self.get("G") - global_scope[global_name] = metadata_checker - code = [f"{global_name}({self.get(guard.name)})"] - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_lambda_guard( + metadata_checker, get_verbose_code_parts(global_name, guard) + ) def EQUALS_MATCH(self, guard: Guard): ref = self.arg_ref(guard) @@ -1540,13 +1494,10 @@ def EQUALS_MATCH(self, guard: Guard): code.append(f"__math_isnan({ref})") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_lambda_guard( - _get_closure_vars()["__math_isnan"], - get_verbose_code_parts(code, guard), - ) - else: - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_lambda_guard( + _get_closure_vars()["__math_isnan"], + get_verbose_code_parts(code, guard), + ) return # Python math library doesn't support complex nan, so we need to use numpy @@ -1556,58 +1507,24 @@ def EQUALS_MATCH(self, guard: Guard): code.append(f"__numpy_isnan({ref})") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_lambda_guard( - _get_closure_vars()["__numpy_isnan"], - get_verbose_code_parts(code, guard), - ) - else: - self._produce_guard_code(guard, code) - return - - if config.enable_cpp_guard_manager: - # Construct a debug string to put into the c++ equals match guard. - code = [f"{ref} == {val!r}"] - if istype(val, ok_mutable_types): - # C++ guards perform a pointer equality check to speedup guards, but the assumption is that the object - # is mutable. For a few corner cases like sets and lists, we make a deepcopy to purposefully fail the - # pointer equality check. - val = deepcopy(val) - self.get_guard_manager(guard).add_equals_match_guard( - val, get_verbose_code_parts(code, guard) + self.get_guard_manager(guard).add_lambda_guard( + _get_closure_vars()["__numpy_isnan"], + get_verbose_code_parts(code, guard), ) - self._set_guard_export_info(guard, code) return - code = [] - - # If matching equality against list/tuple, we must also check that - # the internal types match. (TODO: what about nested lists?) - if istype(val, (list, tuple)): - # NB: SEQUENCE_LENGTH takes care of the outer __check_type_id test - self.SEQUENCE_LENGTH(guard) - - for idx, elem in enumerate(val): - code.append( - f"___check_type_id({ref}[{idx}], {self.id_ref(type(elem))})" - ) - else: - # Add type check to prevent equality check between tensor and non-tensor. - self.TYPE_MATCH(guard) - - if istype(val, torch.Size): - val = tuple(val) - - # Code object can not be compared against their string representation - # I.e `eval(f"{compile('2+2','','exec')!r}")` raises SyntaxError - assert not istype(val, types.CodeType) - - # TODO: It feels like it would be better to just implement our own - # equality test in C that handles all of the necessary type checking - # and NaN tests - code.append(f"{ref} == {val!r}") - self._produce_guard_code(guard, code) + # Construct a debug string to put into the c++ equals match guard. + code = [f"{ref} == {val!r}"] + if istype(val, ok_mutable_types): + # C++ guards perform a pointer equality check to speedup guards, but the assumption is that the object + # is mutable. For a few corner cases like sets and lists, we make a deepcopy to purposefully fail the + # pointer equality check. 
+ val = deepcopy(val) + self.get_guard_manager(guard).add_equals_match_guard( + val, get_verbose_code_parts(code, guard) + ) self._set_guard_export_info(guard, code) + return def CONSTANT_MATCH(self, guard: Guard): val = self.get(guard.name) @@ -1652,7 +1569,7 @@ def SEQUENCE_LENGTH(self, guard): value = self.get(guard.name) t = type(value) - if not (config.enable_cpp_guard_manager and isinstance(value, dict)): + if not isinstance(value, dict): # C++ DICT_LENGTH checks for type self.TYPE_MATCH(guard) @@ -1663,40 +1580,30 @@ def SEQUENCE_LENGTH(self, guard): code.append(f"len({ref}) == {len(value)}") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - if isinstance(value, dict): - self.get_guard_manager(guard).add_dict_length_check_guard( - len(value), get_verbose_code_parts(code, guard) - ) - else: - self.get_guard_manager(guard).add_length_check_guard( - len(value), get_verbose_code_parts(code, guard) - ) + if isinstance(value, dict): + self.get_guard_manager(guard).add_dict_length_check_guard( + len(value), get_verbose_code_parts(code, guard) + ) else: - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_length_check_guard( + len(value), get_verbose_code_parts(code, guard) + ) def TUPLE_ITERATOR_LEN(self, guard): ref = self.arg_ref(guard) value = self.get(guard.name) t = type(value) - if not config.enable_cpp_guard_manager: - # C++ guard already checks the type - self.TYPE_MATCH(guard) - code = [] code.append(f"___tuple_iterator_len({ref}) == {tuple_iterator_len(value)}") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - t = type(value) - obj_id = self.id_ref(t) + t = type(value) + obj_id = self.id_ref(t) - self.get_guard_manager(guard).add_tuple_iterator_length_guard( - tuple_iterator_len(value), obj_id, get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_tuple_iterator_length_guard( + tuple_iterator_len(value), obj_id, get_verbose_code_parts(code, guard) + ) # TODO(voz): Deduplicate w/ AOTAutograd dupe input guards def DUPLICATE_INPUT(self, guard, source_b): @@ -1711,21 +1618,18 @@ def DUPLICATE_INPUT(self, guard, source_b): code = [f"{ref_b} is {ref_a}"] self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - # Check that the guard has not been inserted already - key = (ref_a, ref_b) - if key in self._cached_duplicate_input_guards: - return - self._cached_duplicate_input_guards.add((ref_a, ref_b)) - self._cached_duplicate_input_guards.add((ref_b, ref_a)) - - install_object_aliasing_guard( - self.get_guard_manager(guard), - self.get_guard_manager_from_source(source_b), - get_verbose_code_parts(code, guard), - ) - else: - self._produce_guard_code(guard, code) + # Check that the guard has not been inserted already + key = (ref_a, ref_b) + if key in self._cached_duplicate_input_guards: + return + self._cached_duplicate_input_guards.add((ref_a, ref_b)) + self._cached_duplicate_input_guards.add((ref_b, ref_a)) + + install_object_aliasing_guard( + self.get_guard_manager(guard), + self.get_guard_manager_from_source(source_b), + get_verbose_code_parts(code, guard), + ) def DICT_KEYS(self, guard): # Guard on the keys and their order @@ -1746,24 +1650,18 @@ def DICT_KEYS(self, guard): code.append(f"list({ref}.keys()) == {const_keys_repr}") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - if self.requires_key_order_guarding(guard.originating_source): - 
self.guard_on_dict_keys_and_order(value, guard) - else: - self.guard_on_dict_keys_and_ignore_order(value, guard) + if self.requires_key_order_guarding(guard.originating_source): + self.guard_on_dict_keys_and_order(value, guard) else: - self._produce_guard_code(guard, code) + self.guard_on_dict_keys_and_ignore_order(value, guard) def WEAKREF_ALIVE(self, guard): code = [f"{self.arg_ref(guard)} is not None"] self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_not_none_guard( - get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_not_none_guard( + get_verbose_code_parts(code, guard) + ) def DICT_CONST_KEYS(self, guard): """Constant keys match""" @@ -1771,21 +1669,14 @@ def DICT_CONST_KEYS(self, guard): value = self.get(guard.name) t = type(value) - if not config.enable_cpp_guard_manager: - # DictGuardManager supports TYPE_MATCH internally - self.TYPE_MATCH(guard) - code = [] code.append(f"list({ref}.keys()) == {list(value.keys())!r}") self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - if self.requires_key_order_guarding(guard.originating_source): - self.guard_on_dict_keys_and_order(value, guard) - else: - self.guard_on_dict_keys_and_ignore_order(value, guard) + if self.requires_key_order_guarding(guard.originating_source): + self.guard_on_dict_keys_and_order(value, guard) else: - self._produce_guard_code(guard, code) + self.guard_on_dict_keys_and_ignore_order(value, guard) def EMPTY_NN_MODULE_HOOKS_DICT(self, guard): """Special guard to skip guards on empty hooks. This is controlled by skip_nnmodule_hook_guards""" @@ -1817,12 +1708,9 @@ def DEFAULT_DEVICE(self, guard: Guard): code = [f"utils_device.CURRENT_DEVICE == {m.CURRENT_DEVICE!r}"] self._set_guard_export_info(guard, code) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_default_device_guard( - get_verbose_code_parts(code, guard) - ) - else: - self._produce_guard_code(guard, code) + self.get_guard_manager(guard).add_default_device_guard( + get_verbose_code_parts(code, guard) + ) def SHAPE_ENV(self, guard: Guard): # Let's handle ShapeEnv guards. To do this, we will resolve @@ -1890,18 +1778,14 @@ def get_sources(t_id, dim): for code in code_parts: self._set_guard_export_info(guard, [code]) - if config.enable_cpp_guard_manager: - # Install all the symbolic guards in one lambda guard. These are run - # at the very end of the RootGuardManager via epilogue guards. - # TODO(anijain2305,williamwen42) - Consider moving this to C++. - self.add_python_lambda_leaf_guard_to_root( - code_parts, - verbose_code_parts, - closure_vars={**SYMPY_INTERP, **_get_closure_vars()}, - ) - else: - for code in code_parts: - self._produce_guard_code(guard, [code], shape_env=True) + # Install all the symbolic guards in one lambda guard. These are run + # at the very end of the RootGuardManager via epilogue guards. + # TODO(anijain2305,williamwen42) - Consider moving this to C++. 
+ self.add_python_lambda_leaf_guard_to_root( + code_parts, + verbose_code_parts, + closure_vars={**SYMPY_INTERP, **_get_closure_vars()}, + ) def TENSOR_MATCH(self, guard: Guard, value=None): # For FSDP modules, we can skip guards on nn module tensors because FSDP @@ -1970,30 +1854,29 @@ def TENSOR_MATCH(self, guard: Guard, value=None): self.tensor_check_names.append(tensor_name) self.tensor_check_guards.append(guard) - if config.enable_cpp_guard_manager: - guard_manager = self.get_guard_manager(guard) - # Keep track of all the tensor guard managers to insert - # NoAliasing check at the end. - self.tensor_check_guard_managers.append(guard_manager) - - output_graph = self.check_fn_manager.output_graph - metadata = output_graph.input_source_to_sizes_strides[ - guard.originating_source - ] - size = convert_to_concrete_values(metadata["size"]) - stride = convert_to_concrete_values(metadata["stride"]) - - verbose_code_parts = get_verbose_code_parts( - get_tensor_guard_code_part(value, tensor_name, size, stride), - guard, - ) - guard_manager.add_tensor_match_guard( - value, - size, - stride, - tensor_name, - verbose_code_parts, - ) + guard_manager = self.get_guard_manager(guard) + # Keep track of all the tensor guard managers to insert + # NoAliasing check at the end. + self.tensor_check_guard_managers.append(guard_manager) + + output_graph = self.check_fn_manager.output_graph + metadata = output_graph.input_source_to_sizes_strides[ + guard.originating_source + ] + size = convert_to_concrete_values(metadata["size"]) + stride = convert_to_concrete_values(metadata["stride"]) + + verbose_code_parts = get_verbose_code_parts( + get_tensor_guard_code_part(value, tensor_name, size, stride), + guard, + ) + guard_manager.add_tensor_match_guard( + value, + size, + stride, + tensor_name, + verbose_code_parts, + ) # A frame is valid for reuse with dynamic dimensions if the new # (user-requested) dynamic dimensions are a subset of the old @@ -2033,10 +1916,9 @@ def TENSOR_MATCH(self, guard: Guard, value=None): dynamic_indices = value._dynamo_dynamic_indices code_part = f"(({tensor_name}._dynamo_dynamic_indices.issubset({dynamic_indices})) if hasattr({tensor_name}, '_dynamo_dynamic_indices') else True)" # noqa: B950 code.append(code_part) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_dynamic_indices_guard( - dynamic_indices, get_verbose_code_parts(code_part, guard) - ) + self.get_guard_manager(guard).add_dynamic_indices_guard( + dynamic_indices, get_verbose_code_parts(code_part, guard) + ) # In the case of us not having any dynamic dimension indices, we compiled the frame with no chance of # raising for this specific tensor - and any inputs with more dynamic user directives specified must be recompiled. 
else: @@ -2044,23 +1926,12 @@ def TENSOR_MATCH(self, guard: Guard, value=None): f"hasattr({tensor_name}, '_dynamo_dynamic_indices') == False" ) code.append(code_part) - if config.enable_cpp_guard_manager: - self.get_guard_manager(guard).add_no_hasattr_guard( - "_dynamo_dynamic_indices", - get_verbose_code_parts(code_part, guard), - ) + self.get_guard_manager(guard).add_no_hasattr_guard( + "_dynamo_dynamic_indices", + get_verbose_code_parts(code_part, guard), + ) if len(code) > 0: self._set_guard_export_info(guard, code) - if not config.enable_cpp_guard_manager: - self._produce_guard_code(guard, code) - - # A util that appends guarded code - def _produce_guard_code(self, guard, code_list, shape_env=False): - assert not config.enable_cpp_guard_manager - if shape_env: - self.shape_env_code.append(GuardCodeList(code_list, guard)) - else: - self.code.append(GuardCodeList(code_list, guard)) # A util that in the case of export, adds data onto guards def _set_guard_export_info(self, guard, code_list, provided_guarded_object=None): @@ -2240,9 +2111,7 @@ def __init__( ): guards = output_graph.guards if output_graph else None self._weakrefs: Dict[int, ReferenceType[object]] = {} - self.guard_manager = None - if config.enable_cpp_guard_manager: - self.guard_manager = GuardManager() + self.guard_manager = GuardManager() self.output_graph = output_graph w_builder = None @@ -2314,28 +2183,26 @@ def cleanup_builder(weak_b): # in some form. self.check_fn.id_matched_objs = builder.id_matched_objs - if config.enable_cpp_guard_manager: - # TODO: don't do the string rep, do something more structured here - torch._logging.trace_structured( - "dynamo_cpp_guards_str", payload_fn=lambda: str(self.guard_manager) - ) - guards_log.debug("%s", self.guard_manager) - assert self.guard_manager # to make mypy happy - self.guard_manager.id_matched_objs = builder.id_matched_objs - self.check_fn = self.guard_manager - - # Check that the guard returns True. False means that we will always - # recompile. - # TODO(anijain2305, ydwu4) - Skipping export because of following test - # python -s test/dynamo/test_export.py -k test_export_with_symbool_inputs - if not output_graph.export: - if not self.guard_manager.check(output_graph.local_scope): - reasons = get_guard_fail_reason_helper( - self.guard_manager, # type: ignore[arg-type] - output_graph.local_scope, - CompileContext.current_compile_id(), - ) - raise AssertionError(f"Guard check failed: {reasons}") + # TODO: don't do the string rep, do something more structured here + torch._logging.trace_structured( + "dynamo_cpp_guards_str", payload_fn=lambda: str(self.guard_manager) + ) + guards_log.debug("%s", self.guard_manager) + self.guard_manager.id_matched_objs = builder.id_matched_objs + self.check_fn = self.guard_manager + + # Check that the guard returns True. False means that we will always + # recompile. + # TODO(anijain2305, ydwu4) - Skipping export because of following test + # python -s test/dynamo/test_export.py -k test_export_with_symbool_inputs + if not output_graph.export: + if not self.guard_manager.check(output_graph.local_scope): + reasons = get_guard_fail_reason_helper( + self.guard_manager, # type: ignore[arg-type] + output_graph.local_scope, + CompileContext.current_compile_id(), + ) + raise AssertionError(f"Guard check failed: {reasons}") # NB - We have to very careful of cleaning up here. 
Because of the # invalidate function, we can create a weakref finalizer that keeps @@ -2363,26 +2230,15 @@ def compile_check_fn(self, builder, guards_out, guard_fail_fn): self.torch_function_mode_stack ) - if config.enable_cpp_guard_manager: - # Insert the global_state guard - assert self.guard_manager # to make mypy happy - self.guard_manager.root.add_global_state_guard(["___check_global_state()"]) + # Insert the global_state guard + self.guard_manager.root.add_global_state_guard(["___check_global_state()"]) - self.guard_manager.root.add_torch_function_mode_stack_guard( - self.torch_function_mode_stack, - ["___check_torch_function_mode_stack()"], - ) - # Clear references to torch_function modes held in the list - self.torch_function_mode_stack = None - else: - # Don't report this guard, it's always the same, useless! - global_guard = "___check_global_state()" - code_parts.append(global_guard) - verbose_code_parts.append(global_guard) - - tf_mode_stack_guard = "___check_torch_function_mode_stack()" - code_parts.append(tf_mode_stack_guard) - verbose_code_parts.append(tf_mode_stack_guard) + self.guard_manager.root.add_torch_function_mode_stack_guard( + self.torch_function_mode_stack, + ["___check_torch_function_mode_stack()"], + ) + # Clear references to torch_function modes held in the list + self.torch_function_mode_stack = None def add_code_part(code_part, guard, log_only=False): verbose_code_part = get_verbose_code_part(code_part, guard) @@ -2427,54 +2283,14 @@ def add_code_part(code_part, guard, log_only=False): if code not in seen: # If Cpp guard manager is enabled, we don't need to add to # code_parts. - add_code_part(code, gcl.guard, config.enable_cpp_guard_manager) + add_code_part(code, gcl.guard, True) seen.add(code) tensor_check_names = builder.tensor_check_names check_tensors_fn = None check_tensors_verbose_fn = None - if tensor_check_names and not config.enable_cpp_guard_manager: - tensor_check_guards = builder.tensor_check_guards - assert ( - not self.output_graph.export - ), "Illegal to set tensor_check_names in export." 
- tensor_check_examples = builder.tensor_check_examples - - dynamic_dims_sizes = [] - dynamic_dims_strides = [] - for t, g in zip(tensor_check_examples, tensor_check_guards): - metadata = self.output_graph.input_source_to_sizes_strides[ - g.originating_source - ] - dynamic_dims_sizes.append(convert_to_concrete_values(metadata["size"])) - dynamic_dims_strides.append( - convert_to_concrete_values(metadata["stride"]) - ) - tensor_guards = TensorGuards( - *tensor_check_examples, - dynamic_dims_sizes=dynamic_dims_sizes, - dynamic_dims_strides=dynamic_dims_strides, - ) - check_tensors_fn = tensor_guards.check - check_tensors_verbose_fn = tensor_guards.check_verbose - tensor_check_args = ", ".join( - tensor_check_names + ["tensor_check_names=tensor_check_names"] - ) - # Do this manually, to un-stagger the guards in log message - code_parts.append(f"___check_tensors({tensor_check_args})") - verbose_code_parts.append(f"___check_tensors({tensor_check_args})") - - for i, name in enumerate(tensor_check_names): - # This is a copy of what guards.cpp checks against - # Keep this in sync with TensorCheck constructor - t = tensor_check_examples[i] - sizes = dynamic_dims_sizes[i] - strides = dynamic_dims_strides[i] - code_part = get_tensor_guard_code_part(t, name, sizes, strides) - add_code_part(code_part, tensor_check_guards[i], log_only=True) - - if len(tensor_check_names) > 1 and config.enable_cpp_guard_manager: + if len(tensor_check_names) > 1: # Install tensor aliasing guard. TENSOR_MATCH guards are already # installed for cpp guard manager. install_no_tensor_aliasing_guard( @@ -2497,13 +2313,12 @@ def add_code_part(code_part, guard, log_only=False): source_a = guard.input_source_a source_b = guard.input_source_b code_part = f"{source_a.name()} is {source_b.name()}" - if config.enable_cpp_guard_manager: - install_object_aliasing_guard( - builder.get_guard_manager_from_source(source_a), - builder.get_guard_manager_from_source(source_b), - [code_part], - ) - add_code_part(code_part, None, config.enable_cpp_guard_manager) + install_object_aliasing_guard( + builder.get_guard_manager_from_source(source_a), + builder.get_guard_manager_from_source(source_b), + [code_part], + ) + add_code_part(code_part, None, True) else: raise RuntimeError(f"Unknown GuardEnvExpr: {guard}") @@ -2513,7 +2328,7 @@ def add_code_part(code_part, guard, log_only=False): for code in gcl.code_list: # Shape env guards are already added for CPP guard manager in # SHAPE_ENV implementation. - add_code_part(code, gcl.guard, config.enable_cpp_guard_manager) + add_code_part(code, gcl.guard, True) # OK, all done generating guards if structured_guard_fns: @@ -2536,44 +2351,18 @@ def add_code_part(code_part, guard, log_only=False): } globals_for_guard_fn = {"G": builder.scope["G"]} - if config.enable_cpp_guard_manager: - # Guard manager construction is complete - assert self.guard_manager # to make mypy happy - # TODO (anijain2305) - When enable_cpp_guard_manager is ON by - # default, change the guard_fn name to be guard_manager everywhere - # to avoid confusion. - guard_fn = self.guard_manager - # Ensure we did not miss to insert a guard in cpp guard manager. 
- assert len(code_parts) == 0 - else: - unique_code_parts = list(unique(code_parts)) - make_guard_fn_args = ", ".join(closure_vars.keys()) - guard_body, pycode = build_guard_function( - unique_code_parts, make_guard_fn_args - ) - - if os.environ.get("TORCHDYNAMO_PRINT_GUARDS", None) == "1": - print("GUARDS\n", guard_body) - - out: Dict[str, Any] = {} - - # We don't put builder.scope as the globals in exec call because - # guard_fn.__globals__ becomes equal to builder.scope. This causes - # guard_fn to hold a referece to f_locals sitting in builder.scope["L"] - try: - exec(pycode, globals_for_guard_fn, out) - except SyntaxError as ex: - log.exception("Failed to exec guard at line %s.\n%s", ex.lineno, pycode) - raise - guard_fn = out["___make_guard_fn"](*closure_vars.values()) + # Guard manager construction is complete + # TODO (anijain2305) - When enable_cpp_guard_manager is ON by + # default, change the guard_fn name to be guard_manager everywhere + # to avoid confusion. + guard_fn = self.guard_manager + # Ensure we did not miss to insert a guard in cpp guard manager. + assert len(code_parts) == 0 guard_fn.closure_vars = closure_vars # TODO(whc) maybe '.code_parts' was only kept around for the guard callback? so we don't need both guard_fn.args = largs - if config.enable_cpp_guard_manager: - guard_fn.populate_code_parts_for_debugging() - else: - guard_fn.code_parts = code_parts + guard_fn.populate_code_parts_for_debugging() guard_fn.verbose_code_parts = verbose_code_parts # Grab only G, but preserve "G" because guards access it as "G" guard_fn.global_scope = globals_for_guard_fn @@ -2726,32 +2515,26 @@ def get_guard_fail_reason_helper( no_tensor_aliasing_check_failed = False verbose_code_parts: List[str] = [] - if config.enable_cpp_guard_manager: - guard_manager = guard_fn - guard_debug_info = guard_manager.check_verbose(f_locals) # type: ignore[attr-defined] - # For test_export_with_map_cond, the check_verbose fail even without the - # C++ guard manager. We need to fix the issue to remove the comment. - # assert not guard_debug_info.result - if not guard_debug_info.result: - verbose_code_parts = guard_debug_info.verbose_code_parts - # verbose_code_parts is either the actual reason (e.g. in case of - # TENSOR_MATCH) or it could be a list of verbose_code_part that we - # passed to the leaf guard at construction time. If its a list, we - # walk through this list and find the guard that failed. This is - # very important for symbolic shape guards which are currently - # installed as a lambda guard and can encompass a long list of code_parts. - - if len(verbose_code_parts) == 1: - if "Duplicate tensor found" in verbose_code_parts[0]: - no_tensor_aliasing_check_failed = True - else: - reasons = verbose_code_parts - verbose_code_parts = [] - else: - verbose_code_parts = guard_fn.verbose_code_parts - # This is not needed for CPP guard because the verbose check is already - # run in C++. - scope["___check_tensors"] = scope["___check_tensors_verbose"] + guard_manager = guard_fn + guard_debug_info = guard_manager.check_verbose(f_locals) # type: ignore[attr-defined] + # For test_export_with_map_cond, the check_verbose fail even without the + # C++ guard manager. We need to fix the issue to remove the comment. + # assert not guard_debug_info.result + if not guard_debug_info.result: + verbose_code_parts = guard_debug_info.verbose_code_parts + # verbose_code_parts is either the actual reason (e.g. 
in case of + # TENSOR_MATCH) or it could be a list of verbose_code_part that we + # passed to the leaf guard at construction time. If its a list, we + # walk through this list and find the guard that failed. This is + # very important for symbolic shape guards which are currently + # installed as a lambda guard and can encompass a long list of code_parts. + + if len(verbose_code_parts) == 1: + if "Duplicate tensor found" in verbose_code_parts[0]: + no_tensor_aliasing_check_failed = True + else: + reasons = verbose_code_parts + verbose_code_parts = [] if no_tensor_aliasing_check_failed: reasons = recompilation_reason_for_no_tensor_aliasing_guard(guard_fn, scope) @@ -2876,8 +2659,7 @@ def guard_error_hook( print("lambda " + ", ".join(guard_fn.args) + ":") print(" ", " and\n ".join(guard_fn.code_parts)) - if config.enable_cpp_guard_manager: - print(guard_fn) + print(guard_fn) local_scope = {"L": f_locals, **guard_fn.closure_vars} for guard in guard_fn.code_parts: diff --git a/torch/csrc/dynamo/cache_entry.cpp b/torch/csrc/dynamo/cache_entry.cpp index bf89decf51930..6ea8a441c48fb 100644 --- a/torch/csrc/dynamo/cache_entry.cpp +++ b/torch/csrc/dynamo/cache_entry.cpp @@ -16,11 +16,8 @@ CacheEntry::CacheEntry(const py::handle& guarded_code, PyObject* backend) } else { this->trace_annotation = "Unknown"; } - // TODO - clean this up when enable_cpp_guard_manager is True by default - if (py::hasattr(this->check_fn, "root")) { - this->root_mgr = torch::dynamo::convert_to_root_guard_manager( - this->check_fn.attr("root")); - } + this->root_mgr = + torch::dynamo::convert_to_root_guard_manager(this->check_fn.attr("root")); } C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED( diff --git a/torch/csrc/dynamo/extra_state.cpp b/torch/csrc/dynamo/extra_state.cpp index 73e665e221b63..1c1632b22746b 100644 --- a/torch/csrc/dynamo/extra_state.cpp +++ b/torch/csrc/dynamo/extra_state.cpp @@ -126,14 +126,8 @@ void lookup( if (valid) { try { - // TODO(anijain2305) - Clean this up when enable_cpp_guard_manager is - // True by default - if (cache_entry.root_mgr != nullptr) { - valid = torch::dynamo::run_root_guard_manager( - cache_entry.root_mgr, f_locals); - } else { - valid = cache_entry.check_fn(locals).cast(); - } + valid = torch::dynamo::run_root_guard_manager( + cache_entry.root_mgr, f_locals); } catch (py::error_already_set& e) { if (guard_error_hook) { py::handle guard_error_hook_handle(guard_error_hook); From 2980aed65b6c521e41ec8a995f4c94f184dd741b Mon Sep 17 00:00:00 2001 From: Xuan Zhang Date: Fri, 25 Oct 2024 17:19:34 +0000 Subject: [PATCH 067/161] [inductor][memory] restructuring memory.py and turn on the flag (#137205) Addressing additional comments given in PR https://github.com/pytorch/pytorch/pull/134874 Pull Request resolved: https://github.com/pytorch/pytorch/pull/137205 Approved by: https://github.com/eellison --- .../test_replicate_with_compiler.py | 16 +- test/dynamo/test_logging.py | 4 +- test/inductor/test_memory.py | 4 +- torch/_inductor/config.py | 2 +- torch/_inductor/memory.py | 527 ++++++++++-------- 5 files changed, 325 insertions(+), 228 deletions(-) diff --git a/test/distributed/_composable/test_replicate_with_compiler.py b/test/distributed/_composable/test_replicate_with_compiler.py index 01982effafb5a..0a072ec4ab3ff 100644 --- a/test/distributed/_composable/test_replicate_with_compiler.py +++ b/test/distributed/_composable/test_replicate_with_compiler.py @@ -222,14 +222,18 @@ def test_compile_cpu_no_sync(self): @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") 
@skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) - @torch._inductor.config.patch(reorder_for_locality=False) + @torch._inductor.config.patch( + reorder_for_locality=False, reorder_for_peak_memory=False + ) def test_compile_gpu(self): self._test_compile(use_gpu=True, no_sync=False, checkpoint=False) @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) - @torch._inductor.config.patch(reorder_for_locality=False) + @torch._inductor.config.patch( + reorder_for_locality=False, reorder_for_peak_memory=False + ) def test_compile_gpu_ac(self): self._test_compile(use_gpu=True, no_sync=False, checkpoint=True) @@ -313,7 +317,9 @@ def bwd(loss): ) # todo: This pass mucks things up since Inductor thinks its inference # and can apply this. Should turn off these passes in compiled autograd - @torch._inductor.config.patch(reorder_for_locality=False) + @torch._inductor.config.patch( + reorder_for_locality=False, reorder_for_peak_memory=False + ) def test_bucketing_coalesced_op(self): # Gradient is None code = self._test_bucketing() @@ -349,7 +355,9 @@ def test_bucketing_coalesced_op(self): ) # todo: This pass mucks things up since Inductor thinks its inference # and can apply this. Should turn off these passes in compiled autograd - @torch._inductor.config.patch(reorder_for_locality=False) + @torch._inductor.config.patch( + reorder_for_locality=False, reorder_for_peak_memory=False + ) def test_bucketing_concat_op(self): # Gradient is None code = self._test_bucketing() diff --git a/test/dynamo/test_logging.py b/test/dynamo/test_logging.py index cc1b569edecbb..9ef49da2037fd 100644 --- a/test/dynamo/test_logging.py +++ b/test/dynamo/test_logging.py @@ -162,8 +162,8 @@ def test_dynamo_error(self, records): ) test_aot = within_range_record_test(2, 6, aot=logging.INFO) - test_inductor_debug = within_range_record_test(3, 17, inductor=logging.DEBUG) - test_inductor_info = within_range_record_test(2, 4, inductor=logging.INFO) + test_inductor_debug = within_range_record_test(3, 22, inductor=logging.DEBUG) + test_inductor_info = within_range_record_test(2, 9, inductor=logging.INFO) @make_logging_test() def test_inductor_error(self, records): diff --git a/test/inductor/test_memory.py b/test/inductor/test_memory.py index 8e378a56a569e..185095673a6b5 100644 --- a/test/inductor/test_memory.py +++ b/test/inductor/test_memory.py @@ -147,10 +147,10 @@ def reorder_with_only_bfs( FileCheck() .check("def call(args):") .check("buf0 = ") - .check("buf2 = ") .check("buf1 = ") - .check("buf4 = ") + .check("buf2 = ") .check("buf3 = ") + .check("buf4 = ") .check("buf5 = ") .check("buf7 = ") .run(code) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index ad5db7eb04daa..f39051db75ec7 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -271,7 +271,7 @@ def bundled_autotune_remote_cache_default() -> Optional[bool]: ] # enable operator reordering for peak memory optimization -reorder_for_peak_memory = os.environ.get("TORCHINDUCTOR_REORDER_FOR_PEAK_MEMORY") == "1" +reorder_for_peak_memory = True # runtime estimation function for ops # for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle diff --git a/torch/_inductor/memory.py b/torch/_inductor/memory.py index e0849791b3485..ce260177c2d10 100644 --- a/torch/_inductor/memory.py +++ b/torch/_inductor/memory.py @@ -1,9 +1,10 @@ from __future__ import annotations +import collections import dataclasses import heapq 
import logging -from typing import Callable, Dict, List, Set, Tuple, TYPE_CHECKING, Union +from typing import Callable, Dict, List, Set, Tuple, TYPE_CHECKING, TypedDict, Union from torch._utils_internal import signpost_event from torch.utils._ordered_set import OrderedSet @@ -28,48 +29,35 @@ class MemoryPlanningInfoForBuffer: succ_nodes: OrderedSet[BaseSchedulerNode] = dataclasses.field( default_factory=OrderedSet ) - outdegree: int = 0 # this is used only in topological_sort_lpmf @dataclasses.dataclass class MemoryPlanningInfoForNode: - pred_buffers: List[Union[SchedulerBuffer, FreeableInputBuffer]] = dataclasses.field( - default_factory=list - ) - pred_nodes: List[BaseSchedulerNode] = dataclasses.field(default_factory=list) - succ_nodes: List[BaseSchedulerNode] = dataclasses.field(default_factory=list) - indegree: int = 0 index: int = 0 size: int = 0 - memory_to_free: int = 0 # this is used only in topological_sort_lpmf - size_with_reads: int = 0 # this is used only in topological_sort_dfs + pred_buffers: OrderedSet[ + Union[SchedulerBuffer, FreeableInputBuffer] + ] = dataclasses.field(default_factory=OrderedSet) + pred_nodes: OrderedSet[BaseSchedulerNode] = dataclasses.field( + default_factory=OrderedSet + ) + succ_nodes: OrderedSet[BaseSchedulerNode] = dataclasses.field( + default_factory=OrderedSet + ) @dataclasses.dataclass class FreeableInputBuffer: - dep: Dep + name: str mpi_buffer: MemoryPlanningInfoForBuffer = dataclasses.field( default_factory=MemoryPlanningInfoForBuffer ) def get_name(self) -> str: - return self.dep.name + return self.name def __hash__(self) -> int: - return hash(self.dep.name) - - -def dep_size_hint(dep: Dep) -> int: - res = 0 - try: - if not dep.has_unbacked_symbols(): - res = dep.numbytes_hint() - except KeyError: - # In at least one test (test/inductor/test_torchbind.py) we - # create a StarDep that doesn't exist in the graph and calling - # `has_unbacked_symbols()` throws an error. - pass - return res + return hash(self.name) def get_freeable_input_buf( @@ -78,22 +66,52 @@ def get_freeable_input_buf( ) -> Dict[str, FreeableInputBuffer]: """ Create and keep track of all input buffers that can be freed during the program + + Returns: + A dictionary containing all freeble input buffers, keyed by their names. """ - name_to_input_buf: Dict[str, FreeableInputBuffer] = {} + + # this function is copied from torch/_inductor/scheduler.py + # TODO: would be nice to remove the try/except block for both places + def _dep_size_hint(dep: Dep) -> int: + res = 0 + try: + if not dep.has_unbacked_symbols(): + res = dep.numbytes_hint() + except KeyError: + # In at least one test (test/inductor/test_torchbind.py) we + # create a StarDep that doesn't exist in the graph and calling + # `has_unbacked_symbols()` throws an error. 
+ pass + return res + + # get freeable input buffers' successor nodes and their sizes + # note that different deps can have the same name, so we use name as keys + dep_name_to_succ_nodes: Dict[ + str, OrderedSet[BaseSchedulerNode] + ] = collections.defaultdict(OrderedSet) + dep_name_to_size: Dict[str, int] = dict() for node in nodes: for dep in node.read_writes.reads: - if ( - dep.name in graph_inputs - and not dep.name.startswith("primals_") - and dep.name not in name_to_input_buf - ): - name_to_input_buf[dep.name] = FreeableInputBuffer(dep) - name_to_input_buf[dep.name].mpi_buffer.size_free = dep_size_hint(dep) - - return name_to_input_buf + if dep.name in graph_inputs and not dep.name.startswith("primals_"): + dep_name_to_succ_nodes[dep.name].add(node) + dep_name_to_size[dep.name] = _dep_size_hint(dep) + + # create FreeableInputBuffer objects and add them to the returned dictionary + name_to_freeable_input_buf: Dict[str, FreeableInputBuffer] = dict() + for dep_name, succ_nodes in dep_name_to_succ_nodes.items(): + name_to_freeable_input_buf[dep_name] = FreeableInputBuffer( + dep_name, + MemoryPlanningInfoForBuffer( + size_free=dep_name_to_size[dep_name], succ_nodes=succ_nodes + ), + ) + return name_to_freeable_input_buf -def compute_size_for_scheduler_buffer(name_to_buf: Dict[str, SchedulerBuffer]) -> None: +def compute_size_for_scheduler_buffer( + name_to_buf: Dict[str, SchedulerBuffer] +) -> Dict[str, Tuple[int, int]]: """ Compute the size of each scheduler buffer, including (1) memory allocated when it is created and (2) memory deallocated when it is freed. @@ -107,63 +125,127 @@ def compute_size_for_scheduler_buffer(name_to_buf: Dict[str, SchedulerBuffer]) - buf0: at creation, 30 bytes allocated, when deleted, 0 bytes freed buf1: at creation, 0 bytes allocated, when deleted, 10 bytes freed buf2: at creation, 0 bytes allocated, when deleted, 20 bytes freed + + Returns: + A dictionary mapping a scheduler buffer to a tuple of (size_alloc, size_free). 
""" - from .scheduler import BaseSchedulerNode, OutputNode + from .ir import MultiOutput + from .scheduler import OutputNode - # compute the size of SchedulerBuffer without MultiOutputLayout layout - for sched_buf in name_to_buf.values(): - if not isinstance(sched_buf.node.layout, MultiOutputLayout): - sched_buf.mpi_buffer = MemoryPlanningInfoForBuffer() - sched_buf.mpi_buffer.size_alloc = V.graph.sizevars.size_hint( - sched_buf.node.get_numel(), fallback=0 - ) * get_dtype_size(sched_buf.node.get_dtype()) - sched_buf.mpi_buffer.size_free = sched_buf.mpi_buffer.size_alloc + sched_buf_to_size: Dict[str, Tuple[int, int]] = dict() - # compute the size of SchedulerBuffer with MultiOutputLayout layout - for sched_buf in name_to_buf.values(): + def _compute_and_update_buf_size( + sched_buf: SchedulerBuffer, user_of_MultiOutputLayout: bool = False + ) -> int: if isinstance(sched_buf.node.layout, MultiOutputLayout): - sched_buf.mpi_buffer = MemoryPlanningInfoForBuffer() + size_alloc = 0 for user in sched_buf.users: if isinstance(user.node, OutputNode): continue - assert isinstance(user.node, BaseSchedulerNode) for buf in user.node.get_outputs(): - sched_buf.mpi_buffer.size_alloc += buf.mpi_buffer.size_alloc - buf.mpi_buffer.size_alloc = 0 + if isinstance(buf.node, MultiOutput): + size_alloc += _compute_and_update_buf_size(buf, True) + sched_buf_to_size[sched_buf.get_name()] = ( + 0 if user_of_MultiOutputLayout else size_alloc, + 0, + ) + return size_alloc + else: + buf_size = V.graph.sizevars.size_hint( + sched_buf.node.get_numel(), fallback=0 + ) * get_dtype_size(sched_buf.node.get_dtype()) + sched_buf_to_size[sched_buf.get_name()] = ( + 0 if user_of_MultiOutputLayout else buf_size, + buf_size, + ) + return buf_size + for sched_buf in name_to_buf.values(): + # skip if sched_buf is already processed as an user of another SchedulerBuffer + # whose layout is of the type MultiOutputLayout + if sched_buf.get_name() not in sched_buf_to_size: + _compute_and_update_buf_size(sched_buf) + + return sched_buf_to_size -def map_successor_nodes_with_predecessor_buffers( + +def assign_memory_planning_info_for_scheduler_buffers( nodes: List[BaseSchedulerNode], - name_to_input_buf: Dict[str, FreeableInputBuffer], name_to_buf: Dict[str, SchedulerBuffer], ) -> None: """ - For scheduling and memory estimation, for each scheduler node, we maintain - a list of its dependency buffers (SchedulerBuffer and FreeableInputBuffer). - This is similar to node.read_writes.reads, which is a list of Dep. - Reversely, for each SchedulerBuffer / FreeableInputBuffer, assign its successor nodes. + For each SchedulerBuffer, assign its size info and successor nodes. A buffer's successor nodes determines when a buffer can be freed. 
""" + # get buffer sizes + sched_buf_to_size = compute_size_for_scheduler_buffer(name_to_buf) + + # get buffer's successor nodes + # note that different deps can have the same name, so we use name as keys + dep_name_to_succ_nodes: Dict[ + str, OrderedSet[BaseSchedulerNode] + ] = collections.defaultdict(OrderedSet) for node in nodes: - node.mpi_node = MemoryPlanningInfoForNode() - node.mpi_node.pred_buffers = [] - for dep_name in {dep.name for dep in node.unmet_dependencies}: - sched_buf = name_to_buf.get(dep_name) - if sched_buf: - node.mpi_node.pred_buffers.append(sched_buf) - sched_buf.mpi_buffer.succ_nodes.add(node) - for dep_name in { - dep.name for dep in node.read_writes.reads - node.unmet_dependencies - }: - input_buf = name_to_input_buf.get(dep_name) - if input_buf: - node.mpi_node.pred_buffers.append(input_buf) - input_buf.mpi_buffer.succ_nodes.add(node) + for dep in node.unmet_dependencies: + dep_name_to_succ_nodes[dep.name].add(node) + + # populate the MemoryPlanningInfoForBuffer attribute to each scheduler buffer + # note: there are scheduler buffers not in dep_name_to_succ_nodes (e.g., graph outputs) + for buf_name in name_to_buf.keys(): + name_to_buf[buf_name].mpi_buffer = MemoryPlanningInfoForBuffer( + size_alloc=sched_buf_to_size[buf_name][0], + size_free=sched_buf_to_size[buf_name][1], + succ_nodes=dep_name_to_succ_nodes[buf_name], + ) + + +def assign_memory_planning_info_for_scheduler_nodes( + nodes: List[BaseSchedulerNode], + name_to_fused_node: Dict[str, BaseSchedulerNode], + name_to_buf: Dict[str, SchedulerBuffer], + name_to_freeable_input_buf: Dict[str, FreeableInputBuffer], +) -> None: + """ + Assign to each scheduler node its predecessor and successor nodes. + """ + from .scheduler import SchedulerBuffer + + for index, node in enumerate(nodes): + size_alloc = sum(buffer.mpi_buffer.size_alloc for buffer in node.get_outputs()) + pred_buffers: OrderedSet[ + Union[SchedulerBuffer, FreeableInputBuffer] + ] = OrderedSet() + for dep in node.read_writes.reads: + if dep.name in name_to_buf and dep in node.unmet_dependencies: + pred_buffers.add(name_to_buf[dep.name]) + elif dep.name in name_to_freeable_input_buf: + pred_buffers.add(name_to_freeable_input_buf[dep.name]) + pred_nodes = OrderedSet( + { + name_to_fused_node[pred_buffer.defining_op.get_name()] + for pred_buffer in pred_buffers + if (isinstance(pred_buffer, SchedulerBuffer)) + } + ) + succ_nodes = OrderedSet( + { + succ_node + for buffer in node.get_outputs() + for succ_node in buffer.mpi_buffer.succ_nodes + } + ) + node.mpi_node = MemoryPlanningInfoForNode( + index=index, + size=size_alloc, + pred_buffers=pred_buffers, + pred_nodes=pred_nodes, + succ_nodes=succ_nodes, + ) def estimate_peak_memory( nodes: List[BaseSchedulerNode], - name_to_input_buf: Dict[str, FreeableInputBuffer], + name_to_freeable_input_buf: Dict[str, FreeableInputBuffer], graph_outputs: Set[str], ) -> Tuple[int, List[int]]: """ @@ -172,63 +254,79 @@ def estimate_peak_memory( Returns: int: peak memory - List[int]: memory usage at each node. + List[int]: memory usage at each node (or each step). 
""" - # map each scheduler buffer to its size, start time, and end time + # map each scheduler buffer to its size, start step, and end step @dataclasses.dataclass class BufferInfo: buffer: Union[SchedulerBuffer, FreeableInputBuffer] size_alloc: int size_free: int - start_time: int - end_time: int - - name_to_buf_info: Dict[str, BufferInfo] = {} - node_name_to_time: Dict[str, int] = {} - - # assign start_time - for buf_name, input_buf in name_to_input_buf.items(): - name_to_buf_info[buf_name] = BufferInfo( - input_buf, - input_buf.mpi_buffer.size_free, - input_buf.mpi_buffer.size_free, - 0, - 0, + start_step: int + end_step: int + + # get the execution step of each node, this will be used to determine + # the end_step of buffers + node_to_step: Dict[BaseSchedulerNode, int] = dict() + for step, node in enumerate(nodes): + node_to_step[node] = step + + # get buffers' size and liveliness information + buf_info_list: List[BufferInfo] = [] + # 1. for freeable input buffers + for buf_name, input_buf in name_to_freeable_input_buf.items(): + end_step = ( + len(nodes) - 1 + if buf_name in graph_outputs + else max( + node_to_step[succ_node] for succ_node in input_buf.mpi_buffer.succ_nodes + ) ) - for t, node in enumerate(nodes): - node_name_to_time[node.get_name()] = t - for sched_buf in node.get_outputs(): - name_to_buf_info[sched_buf.get_name()] = BufferInfo( - sched_buf, - sched_buf.mpi_buffer.size_alloc, - sched_buf.mpi_buffer.size_free, - t, - t, + buf_info_list.append( + BufferInfo( + input_buf, + input_buf.mpi_buffer.size_free, + input_buf.mpi_buffer.size_free, + 0, + end_step, ) + ) - # assign end_time - for buf_name, buf_info in name_to_buf_info.items(): - succ_node_time = [ - node_name_to_time[succ_node.get_name()] - for succ_node in buf_info.buffer.mpi_buffer.succ_nodes - if succ_node.get_name() in node_name_to_time - ] - if succ_node_time: - buf_info.end_time = max(succ_node_time) - - # the end time of output buffers should be at the end of the horizon - for buf_name in graph_outputs: - if buf_name in name_to_buf_info: - name_to_buf_info[buf_name].end_time = len(nodes) - 1 + # 2. for scheduler buffers + for step, node in enumerate(nodes): + for sched_buf in node.get_outputs(): + # note: it is possible for a non-graph-output sched_buf to have no succ_nodes and + # to be only used by its defining op (e.g., due to fusion when all consumers of + # the buffer are fused with its defining op). In such cases, end_step is step. 
+ end_step = ( + len(nodes) - 1 + if sched_buf.get_name() in graph_outputs + else max( + [ + node_to_step[succ_node] + for succ_node in sched_buf.mpi_buffer.succ_nodes + ], + default=step, + ) + ) + buf_info_list.append( + BufferInfo( + sched_buf, + sched_buf.mpi_buffer.size_alloc, + sched_buf.mpi_buffer.size_free, + step, + end_step, + ) + ) - # incremental memory changes at each time period + # incremental memory changes at each step memory = [0 for _ in range(len(nodes) + 1)] # for each buffer, update memory when created and when freed - for buf_name, buf_info in name_to_buf_info.items(): - memory[buf_info.start_time] += buf_info.size_alloc - memory[buf_info.end_time + 1] -= buf_info.size_free + for buf_info in buf_info_list: + memory[buf_info.start_step] += buf_info.size_alloc + memory[buf_info.end_step + 1] -= buf_info.size_free # get peak memory by compute the cumulative memories max_memory = 0 @@ -242,42 +340,19 @@ class BufferInfo: return (max_memory, memories_at_nodes) -def assign_predcessor_and_successor_nodes_to_nodes( - nodes: List[BaseSchedulerNode], name_to_fused_node: Dict[str, BaseSchedulerNode] -) -> None: - """ - Assign to each scheduler node its predecessor and successor nodes. - """ - from .scheduler import SchedulerBuffer - - for node in nodes: - node.mpi_node.pred_nodes = list( - { - name_to_fused_node[pred_buffer.defining_op.get_name()] - for pred_buffer in node.mpi_node.pred_buffers - if ( - isinstance(pred_buffer, SchedulerBuffer) - and pred_buffer.defining_op.get_name() in name_to_fused_node - ) - } - ) - node.mpi_node.succ_nodes = list( - { - succ_node - for buffer in node.get_outputs() - for succ_node in buffer.mpi_buffer.succ_nodes - } - ) - - def topological_sort_lpmf( nodes: List[BaseSchedulerNode], - name_to_input_buf: Dict[str, FreeableInputBuffer], + name_to_freeable_input_buf: Dict[str, FreeableInputBuffer], name_to_buf: Dict[str, SchedulerBuffer], graph_outputs: Set[str], ) -> List[BaseSchedulerNode]: """ A bfs-based greedy topological order. LPMF stands for "Least Peak Memory First". + + The idea is from this paper: + Buffer memory optimization for video codec application modeled in Simulink + https://www.cs.york.ac.uk/rts/docs/DAC-1964-2006/PAPERS/2006/DAC06/PDFFILES/P0689.PDF + The algorithm maintain the max memory so far. At every iteration, for each scheduleable node, it computes: - how much memory needs to be allocated for the output buffers of this node; @@ -291,53 +366,61 @@ def topological_sort_lpmf( (ii) otherwise, pick the one with the lowest mem1 value. 
""" + class NodeInfo(TypedDict): + indegree: int + memory_to_free: int + + class BufferInfo(TypedDict): + outdegree: int + + node_info: Dict[BaseSchedulerNode, NodeInfo] = dict() + buf_info: Dict[Union[SchedulerBuffer, FreeableInputBuffer], BufferInfo] = dict() + # compute nodes' number of unmet dependencies (for schedulability) # initialize the list of nodes ready to be scheduled - nodes_to_schedule: Set[BaseSchedulerNode] = set() + nodes_to_schedule: OrderedSet[BaseSchedulerNode] = OrderedSet() for node in nodes: - # note that .unmet_dependencies could have deps with the same name - # and in that case, it should only be counted once - node.mpi_node.indegree = len(node.mpi_node.pred_nodes) - if node.mpi_node.indegree == 0: + node_info[node] = { + "indegree": len(node.mpi_node.pred_nodes), + "memory_to_free": 0, + } + if node_info[node]["indegree"] == 0: nodes_to_schedule.add(node) # compute buffers' number of unmet successors (used to decide when to free) - for buf in list(name_to_buf.values()) + list(name_to_input_buf.values()): - buf.mpi_buffer.outdegree = len(buf.mpi_buffer.succ_nodes) - if buf.get_name() in graph_outputs: - buf.mpi_buffer.outdegree += 1 + for buf in list(name_to_buf.values()) + list(name_to_freeable_input_buf.values()): + buf_info[buf] = { + "outdegree": len(buf.mpi_buffer.succ_nodes) + + (1 if buf.get_name() in graph_outputs else 0) + } # initialize memory estimations live_memory = sum( - input_buf.mpi_buffer.size_free for input_buf in name_to_input_buf.values() + input_buf.mpi_buffer.size_free + for input_buf in name_to_freeable_input_buf.values() ) # this is the total output memory, which is a lower bound for peak memory - output_memory = sum( - name_to_buf[buf_name].mpi_buffer.size_free - for buf_name in graph_outputs - if buf_name in name_to_buf - ) + # we do not include the memory of non freeable input buffers + output_memory = 0 + for buf_name in graph_outputs: + if buf_name in name_to_buf: + output_memory += name_to_buf[buf_name].mpi_buffer.size_free + elif buf_name in name_to_freeable_input_buf: + output_memory += name_to_freeable_input_buf[buf_name].mpi_buffer.size_free max_memory = max(live_memory, output_memory) # compute the amount of memory that is allocated when a node is scheduled # and the amount of memory that can be freed when a node is scheduled for i, node in enumerate(nodes): - node.mpi_node.index = i # keep track of the original order - node.mpi_node.size = sum( - buffer.mpi_buffer.size_alloc for buffer in node.get_outputs() - ) - node.mpi_node.memory_to_free = 0 # 1. if a buffer read by this node is last used by this node - # then the buffer can be freed for buf in node.mpi_node.pred_buffers: - if buf.mpi_buffer.outdegree == 1: - node.mpi_node.memory_to_free += buf.mpi_buffer.size_free - # 2. if a buffer written by this node is used internally and - # not needed afterwards, it can be freed + if buf_info[buf]["outdegree"] == 1: + node_info[node]["memory_to_free"] += buf.mpi_buffer.size_free + # 2. 
if a buffer written by this node is used internally and not used later for buf in node.get_outputs(): - if buf.mpi_buffer.outdegree == 0: - node.mpi_node.memory_to_free += buf.mpi_buffer.size_free + if buf_info[buf]["outdegree"] == 0: + node_info[node]["memory_to_free"] += buf.mpi_buffer.size_free # schedule nodes one at a time schedule: List[BaseSchedulerNode] = [] @@ -348,7 +431,7 @@ def topological_sort_lpmf( nodes_to_schedule, key=lambda node: ( max(live_memory + node.mpi_node.size, max_memory), - node.mpi_node.size - node.mpi_node.memory_to_free, + node.mpi_node.size - node_info[node]["memory_to_free"], node.mpi_node.index, ), ) @@ -359,22 +442,22 @@ def topological_sort_lpmf( # update memory usage live_memory += selected_node.mpi_node.size max_memory = max(max_memory, live_memory) - live_memory -= selected_node.mpi_node.memory_to_free + live_memory -= node_info[node]["memory_to_free"] # update successor nodes and nodes_to_schedule for succ_node in selected_node.mpi_node.succ_nodes: - assert succ_node.mpi_node.indegree > 0 - succ_node.mpi_node.indegree -= 1 - if succ_node.mpi_node.indegree == 0: + assert node_info[succ_node]["indegree"] > 0 + node_info[succ_node]["indegree"] -= 1 + if node_info[succ_node]["indegree"] == 0: nodes_to_schedule.add(succ_node) # update predecessor nodes for buf in selected_node.mpi_node.pred_buffers: - assert buf.mpi_buffer.outdegree > 0 - buf.mpi_buffer.outdegree -= 1 - if buf.mpi_buffer.outdegree == 1: + assert buf_info[buf]["outdegree"] > 0 + buf_info[buf]["outdegree"] -= 1 + if buf_info[buf]["outdegree"] == 1: for succ_node in buf.mpi_buffer.succ_nodes: - succ_node.mpi_node.memory_to_free += buf.mpi_buffer.size_free + node_info[succ_node]["memory_to_free"] += buf.mpi_buffer.size_free if num_iters > len(nodes): raise RuntimeError("Failed to schedule, while loop ran too long for lpmf") @@ -392,34 +475,39 @@ def topological_sort_bfs(nodes: List[BaseSchedulerNode]) -> List[BaseSchedulerNo idea aims to reduce the liveness duration of buffers created. 
""" + class NodeInfo(TypedDict): + indegree: int + order: int + + node_info: Dict[BaseSchedulerNode, NodeInfo] = dict() + @dataclasses.dataclass - class HeapElement: + class NodeWithPriority: priority: List[int] node: BaseSchedulerNode - def __lt__(self, other: HeapElement) -> bool: + def __lt__(self, other: NodeWithPriority) -> bool: if self.priority == other.priority: return self.node.mpi_node.index < other.node.mpi_node.index return self.priority < other.priority def _node_priority(node: BaseSchedulerNode) -> List[int]: - assert node.mpi_node.indegree == 0 - ids = sorted( - {pred_node.mpi_node.index for pred_node in node.mpi_node.pred_nodes} + # priority is the order in which predecessor nodes are executed + assert node_info[node]["indegree"] == 0 + exec_orders = sorted( + {node_info[pred_node]["order"] for pred_node in node.mpi_node.pred_nodes} ) - ids.append(node.mpi_node.index) - return ids + return exec_orders # compute nodes' number of unmet dependencies (for schedulability) # initialize the list of nodes ready to be scheduled - nodes_to_schedule: List[HeapElement] = [] - for t, node in enumerate(nodes): - node.mpi_node.index = t - # note that .unmet_dependencies could have deps with the same name - # and in that case, it should only be counted once - node.mpi_node.indegree = len(node.mpi_node.pred_nodes) - if node.mpi_node.indegree == 0: - heapq.heappush(nodes_to_schedule, HeapElement(_node_priority(node), node)) + nodes_to_schedule: List[NodeWithPriority] = [] + for node in nodes: + node_info[node] = {"indegree": len(node.mpi_node.pred_nodes), "order": -1} + if node_info[node]["indegree"] == 0: + heapq.heappush( + nodes_to_schedule, NodeWithPriority(_node_priority(node), node) + ) # schedule nodes one at a time schedule: List[BaseSchedulerNode] = [] @@ -427,22 +515,23 @@ def _node_priority(node: BaseSchedulerNode) -> List[int]: while num_iters < len(nodes) and nodes_to_schedule: # select a node to schedule selected_node = heapq.heappop(nodes_to_schedule).node - selected_node.mpi_node.index = len(schedule) + node_info[selected_node]["order"] = len(schedule) schedule.append(selected_node) num_iters += 1 # update successor nodes and nodes_to_schedule for succ_node in selected_node.mpi_node.succ_nodes: - assert succ_node.mpi_node.indegree > 0 - succ_node.mpi_node.indegree -= 1 - if succ_node.mpi_node.indegree == 0: + assert node_info[succ_node]["indegree"] > 0 + node_info[succ_node]["indegree"] -= 1 + if node_info[succ_node]["indegree"] == 0: heapq.heappush( nodes_to_schedule, - HeapElement(_node_priority(succ_node), succ_node), + NodeWithPriority(_node_priority(succ_node), succ_node), ) if num_iters > len(nodes): raise RuntimeError("Failed to schedule, while loop ran too long for bfs") + return schedule @@ -458,6 +547,7 @@ def topological_sort_dfs(nodes: List[BaseSchedulerNode]) -> List[BaseSchedulerNo seen: OrderedSet[BaseSchedulerNode] = OrderedSet() name_to_node: Dict[str, BaseSchedulerNode] = dict() result: List[BaseSchedulerNode] = [] + size_with_reads: Dict[BaseSchedulerNode, int] = dict() def visit(n: BaseSchedulerNode) -> None: if n not in seen: @@ -468,7 +558,7 @@ def visit(n: BaseSchedulerNode) -> None: if dep.name in name_to_node ] for node in sorted( - dep_nodes, key=lambda x: (x.mpi_node.size_with_reads, x.mpi_node.index) + dep_nodes, key=lambda n: (size_with_reads[n], n.mpi_node.index) ): visit(node) result.append(n) @@ -477,18 +567,13 @@ def visit(n: BaseSchedulerNode) -> None: for name in node.get_buffer_names(): name_to_node[name] = node - for t, node in 
enumerate(nodes): - node.mpi_node.index = t - node.mpi_node.size = sum( - buffer.mpi_buffer.size_alloc for buffer in node.get_outputs() - ) - node.mpi_node.size_with_reads = node.mpi_node.size + sum( + for node in nodes: + size_with_reads[node] = node.mpi_node.size + sum( pred_buf.mpi_buffer.size_free for pred_buf in node.mpi_node.pred_buffers ) - for node in sorted( - nodes, key=lambda x: (x.mpi_node.size_with_reads, x.mpi_node.index) - ): + for node in sorted(nodes, key=lambda n: (size_with_reads[n], n.mpi_node.index)): visit(node) + return result @@ -509,7 +594,7 @@ def reorder_for_peak_memory( resulting topological order has the lowest peak memory estimation. """ - torch_log.warning("Reordering for peak memory -- %d nodes", len(nodes)) + torch_log.info("Reordering for peak memory -- %d nodes", len(nodes)) @dataclasses.dataclass class PeakMemoryResult: @@ -519,19 +604,20 @@ class PeakMemoryResult: # preparation -- as nodes are scheduled one at a time, these help # keep track of when a buffer can be freed, and when a node can be scheduled - name_to_input_buf: Dict[str, FreeableInputBuffer] = get_freeable_input_buf( + name_to_freeable_input_buf: Dict[str, FreeableInputBuffer] = get_freeable_input_buf( nodes, graph_inputs ) - compute_size_for_scheduler_buffer(name_to_buf) - map_successor_nodes_with_predecessor_buffers(nodes, name_to_input_buf, name_to_buf) - assign_predcessor_and_successor_nodes_to_nodes(nodes, name_to_fused_node) + assign_memory_planning_info_for_scheduler_buffers(nodes, name_to_buf) + assign_memory_planning_info_for_scheduler_nodes( + nodes, name_to_fused_node, name_to_buf, name_to_freeable_input_buf + ) # keep track of the peak memory estimates of different methods peak_memory_diff_methods: List[PeakMemoryResult] = [] # the default estimated_peak_memory, _ = estimate_peak_memory( - nodes, name_to_input_buf, graph_outputs + nodes, name_to_freeable_input_buf, graph_outputs ) peak_memory_diff_methods.append( PeakMemoryResult(nodes, estimated_peak_memory, "baseline") @@ -542,12 +628,14 @@ class PeakMemoryResult: for method in methods: try: if method == topological_sort_lpmf: - order = method(nodes, name_to_input_buf, name_to_buf, graph_outputs) + order = method( + nodes, name_to_freeable_input_buf, name_to_buf, graph_outputs + ) else: order = method(nodes) assert len(order) == len(nodes) peak_memory, _ = estimate_peak_memory( - order, name_to_input_buf, graph_outputs + order, name_to_freeable_input_buf, graph_outputs ) peak_memory_diff_methods.append( PeakMemoryResult(order, peak_memory, method.__name__) @@ -566,4 +654,5 @@ class PeakMemoryResult: # get the optimal one best_result = min(peak_memory_diff_methods, key=lambda x: x.peak_memory) + return best_result.order From 447bb72822aacbb343c38c35fe4e97844f4e3919 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 25 Oct 2024 17:35:27 +0000 Subject: [PATCH 068/161] Revert "[c10d][CI] Improve world size setting in some tests (#138846)" This reverts commit 9c35e33d9b02e384f0d504f942a916e9e849b163. 
Reverted https://github.com/pytorch/pytorch/pull/138846 on behalf of https://github.com/jeanschmidt due to introduced breaks in linux-focal-cuda11.8-py3.10-gcc9 ([comment](https://github.com/pytorch/pytorch/pull/138846#issuecomment-2438415315)) --- test/distributed/test_c10d_object_collectives.py | 7 +++---- test/distributed/test_c10d_ops_nccl.py | 9 +-------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/test/distributed/test_c10d_object_collectives.py b/test/distributed/test_c10d_object_collectives.py index dcd6de797e725..ece50ebe8890b 100644 --- a/test/distributed/test_c10d_object_collectives.py +++ b/test/distributed/test_c10d_object_collectives.py @@ -24,6 +24,7 @@ sys.exit(0) BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO +WORLD_SIZE = min(4, max(2, torch.cuda.device_count())) def with_comms(func=None): @@ -53,16 +54,14 @@ def setUp(self): @property def device(self): return ( - torch.device("cuda", self.rank % torch.cuda.device_count()) + torch.device(self.rank) if BACKEND == dist.Backend.NCCL else torch.device("cpu") ) @property def world_size(self): - if BACKEND == dist.Backend.NCCL: - return torch.cuda.device_count() - return super().world_size + return WORLD_SIZE @property def process_group(self): diff --git a/test/distributed/test_c10d_ops_nccl.py b/test/distributed/test_c10d_ops_nccl.py index 1b5c4d98f1481..c9fb0f30b53f9 100644 --- a/test/distributed/test_c10d_ops_nccl.py +++ b/test/distributed/test_c10d_ops_nccl.py @@ -28,7 +28,6 @@ init_multigpu_helper, MultiProcContinousTest, requires_nccl, - TEST_SKIPS, ) from torch.testing._internal.common_utils import ( skip_but_pass_in_sandcastle_if, @@ -980,14 +979,8 @@ def allgather_base(output_t, input_t): if __name__ == "__main__": - if not torch.cuda.is_available(): - sys.exit(TEST_SKIPS["no_cuda"].exit_code) - rank = int(os.getenv("RANK", -1)) - world_size = int(os.getenv("WORLD_SIZE", -1)) - - if world_size == -1: # Not set by external launcher - world_size = torch.cuda.device_count() + world_size = int(os.getenv("WORLD_SIZE", 2)) if rank != -1: # Launched with torchrun or other multi-proc launchers. Directly run the test. From 6f66398ab826706054ca4090ad82753287a067fd Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 25 Oct 2024 17:37:30 +0000 Subject: [PATCH 069/161] Revert "[aotd] Unwrap unseen AsyncCollectiveTensor tangents (#138731)" This reverts commit 245026af2d2f26c74993cb90e01bddbd627c6797. 
Reverted https://github.com/pytorch/pytorch/pull/138731 on behalf of https://github.com/jeanschmidt due to introduced regressions on linux-focal-cuda12.1-py3.10-gcc9-bazel-test ([comment](https://github.com/pytorch/pytorch/pull/138731#issuecomment-2438417669)) --- test/functorch/test_aotdispatch.py | 30 ------------------- .../_aot_autograd/runtime_wrappers.py | 15 +--------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index b1689e5afb2ef..3e1eeb8255b75 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -45,7 +45,6 @@ from torch._higher_order_ops.out_dtype import out_dtype from torch._inductor.codecache import compiled_fx_graph_hash from torch._subclasses.fake_tensor import DynamicOutputShapeException, FakeTensorMode -from torch.distributed._functional_collectives import AsyncCollectiveTensor from torch.fx.experimental.proxy_tensor import is_sym_node from torch.fx.experimental.symbolic_shapes import GuardOnDataDependentSymNode, ShapeEnv from torch.nn.utils.rnn import PackedSequence @@ -6185,35 +6184,6 @@ def fn(x): out_buffer = out.values() ga, gb, gc = torch.autograd.grad(out_buffer.sum(), (a, b, c)) - def test_unwrap_async_collective_tensor_tangent(self): - def fn(x): - return x.clone() - - ref_x = TwoTensor( - torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) - ) - ref_y = fn(ref_x) - ref_y.backward(gradient=TwoTensor(torch.randn(2, 3), torch.randn(2, 3))) - - fn_comp = torch.compile(fn, fullgraph=True) - - x = TwoTensor( - torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) - ) - y = fn_comp(x) - y.backward(gradient=TwoTensor(torch.randn(2, 3), torch.randn(2, 3))) - - x2 = TwoTensor( - torch.randn(2, 3, requires_grad=True), torch.randn(2, 3, requires_grad=True) - ) - y2 = fn_comp(x2) - y2.backward( - gradient=TwoTensor( - AsyncCollectiveTensor(torch.randn(2, 3)), - AsyncCollectiveTensor(torch.randn(2, 3)), - ) - ) - @torch._inductor.config.patch({"freezing": True}) def test_inductor_freezing_with_subclasses(self): class M(torch.nn.Module): diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 7ab1d41a30cc7..9e8a21321ad77 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -27,7 +27,6 @@ ) from torch._prims_common import CUDARngStateHelper from torch._subclasses import FakeTensor -from torch.distributed._functional_collectives import AsyncCollectiveTensor from torch.fx.experimental._backward_state import BackwardState from torch.multiprocessing.reductions import StorageWeakRef from torch.utils._python_dispatch import is_traceable_wrapper_subclass @@ -1444,19 +1443,7 @@ def coerce_runtime_tangent_tracing_memory_format(x, memory_format): return x is_subclass: bool = is_traceable_wrapper_subclass(x) - mem_format = memory_format - if is_subclass: - memory_format_for_dense_tensor = not isinstance(memory_format, list) - if isinstance(x, AsyncCollectiveTensor) and memory_format_for_dense_tensor: - # This is AsyncCollectiveTensor, that we have not seen during tracing time. 
- while True: - x = x.trigger_wait() - # Checking recursive AsyncCollectiveTensor - if not isinstance(x, AsyncCollectiveTensor): - break - is_subclass = False - else: - mem_format = memory_format[0] + mem_format = memory_format[0] if is_subclass else memory_format if not x.is_contiguous(memory_format=mem_format): x = x.contiguous(memory_format=mem_format) From 3d0aa6f04926d4914d69b09306027f2bb571700e Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 25 Oct 2024 17:40:58 +0000 Subject: [PATCH 070/161] Update readme with std::optional (#138914) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138914 Approved by: https://github.com/malfet --- aten/src/ATen/core/op_registration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md index 61b41b48c4a67..45b9bfa7b4199 100644 --- a/aten/src/ATen/core/op_registration/README.md +++ b/aten/src/ATen/core/op_registration/README.md @@ -140,7 +140,7 @@ Or with annotations: ``` namespace { - Tensor my_kernel_cpu(const Tensor& a, int64_t b, at::optional c) {...} + Tensor my_kernel_cpu(const Tensor& a, int64_t b, std::optional c) {...} } static auto registry = torch::RegisterOperators() @@ -176,7 +176,7 @@ The kernel function can take any of the following types as inputs or outputs: * `bool` * `c10::string_view` * `at::Scalar` (this is a type that can hold either an integer or a floating point value) -* `at::optional` with T being any type from the list above +* `std::optional` with T being any type from the list above The kernel function can take and return list inputs by using `torch::List`. `T` must be one of the supported types from above excluding `at::Scalar`. From 375d71cc5a5f51f22eab109a5cc5daecd31c463f Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 25 Oct 2024 17:56:14 +0000 Subject: [PATCH 071/161] plumb is_export flag to FunctionalTensorMode in analysis pass (#138836) Summary: there is an issue with functionalization V2 in export. This is a quick fix that plumbs `is_export` through to `run_functionalized_fw_and_collect_metadata`. 
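A self-contained sketch of the plumbing pattern described above, using stand-in names only (`AOTConfigStub`, `FunctionalModeStub`, `run_analysis` and `dispatch` are illustrative, not the real torch internals; the actual one-line changes are in the diff below):

```python
# Hypothetical stand-ins: the dispatcher owns the config, but the analysis
# helper constructs the functionalization mode, so the flag has to be
# forwarded explicitly through the helper's signature.
from dataclasses import dataclass


@dataclass
class AOTConfigStub:
    is_export: bool = False


class FunctionalModeStub:
    def __init__(self, *, export: bool = False) -> None:
        # analogous to FunctionalTensorMode(..., export=...)
        self.export = export


def run_analysis(*, is_export: bool = False) -> FunctionalModeStub:
    # Without the extra parameter, the mode is always built with its
    # default (V2) behavior during metadata collection.
    return FunctionalModeStub(export=is_export)


def dispatch(config: AOTConfigStub) -> FunctionalModeStub:
    return run_analysis(is_export=config.is_export)


assert dispatch(AOTConfigStub(is_export=True)).export
```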
Test Plan: CI

Differential Revision: D64915263

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138836
Approved by: https://github.com/tugsbayasgalan
---
 torch/_functorch/_aot_autograd/collect_metadata_analysis.py | 5 ++++-
 torch/_functorch/aot_autograd.py                            | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/torch/_functorch/_aot_autograd/collect_metadata_analysis.py b/torch/_functorch/_aot_autograd/collect_metadata_analysis.py
index d84321ddf4b07..7fa17ba2ff9dc 100644
--- a/torch/_functorch/_aot_autograd/collect_metadata_analysis.py
+++ b/torch/_functorch/_aot_autograd/collect_metadata_analysis.py
@@ -152,6 +152,9 @@ def run_functionalized_fw_and_collect_metadata(
     # Note: this is guaranteed to be set when running under dynamo
     static_input_indices: Optional[List[int]] = None,
     pre_dispatch: bool = False,
+    # is_export is technically only needed to avoid using functionalization V2
+    # during analysis
+    is_export: bool = False,
 ) -> Callable[..., ViewAndMutationMeta]:
     memo: Dict[Tensor, Tensor] = {}

@@ -183,7 +186,7 @@ def inner(*flat_args):

         # It doesn't matter if we run this under predispatch or not because it is
         # only for figuring out metadata
-        mode = FunctionalTensorMode(_allow_token_discovery=True)
+        mode = FunctionalTensorMode(_allow_token_discovery=True, export=is_export)
         suppress_pending = contextlib.nullcontext()
         fake_mode = detect_fake_mode()
         if fake_mode and (shape_env := fake_mode.shape_env):
diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py
index a284689c0e72c..add969035f0bc 100644
--- a/torch/_functorch/aot_autograd.py
+++ b/torch/_functorch/aot_autograd.py
@@ -628,6 +628,7 @@ def _dup_fake_script_obj(fake_flat_args):
                keep_input_mutations=aot_config.keep_inference_input_mutations,
                is_train=needs_autograd,
                pre_dispatch=aot_config.pre_dispatch,
+                is_export=aot_config.is_export,
            )(*_dup_fake_script_obj(fake_flat_args))

            req_subclass_dispatch = requires_subclass_dispatch(

From a6287b5c274c71f72ac3845e9befec9578ad799e Mon Sep 17 00:00:00 2001
From: Gagan Jain
Date: Fri, 25 Oct 2024 17:57:25 +0000
Subject: [PATCH 072/161] Fixing issue in move pass for copying Parameter (#138855)

Summary: Fixing bug for Parameter copy during move pass of exported graph.

Test Plan: UT runs on APS models.
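For context on the one-line fix below: `torch.nn.Parameter` defaults to `requires_grad=True`, so re-wrapping a moved tensor without forwarding the original flag silently turns a frozen parameter back into a trainable one, which is presumably the failure mode being fixed here. A minimal illustration (plain CPU tensors, not the export move pass itself):

```python
import torch

frozen = torch.nn.Parameter(torch.zeros(3), requires_grad=False)

# Re-wrapping the moved data without the flag: nn.Parameter's default
# requires_grad=True wins, and the parameter becomes trainable again.
rewrapped = torch.nn.Parameter(frozen.to("cpu"))
print(rewrapped.requires_grad)  # True -- original flag lost

# Forwarding the original flag, as the patch does, preserves it.
preserved = torch.nn.Parameter(frozen.to("cpu"), frozen.requires_grad)
print(preserved.requires_grad)  # False
```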
Differential Revision: D64876951 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138855 Approved by: https://github.com/pianpwk Co-authored-by: Gagan Jain --- torch/export/passes/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/export/passes/__init__.py b/torch/export/passes/__init__.py index c523b954e88e7..57466bee49d0a 100644 --- a/torch/export/passes/__init__.py +++ b/torch/export/passes/__init__.py @@ -41,7 +41,8 @@ def _get_new_device( for k, v in ep.state_dict.items(): if isinstance(v, torch.nn.Parameter): ep._state_dict[k] = torch.nn.Parameter( - v.to(_get_new_device(v.device, location)) + v.to(_get_new_device(v.device, location)), + v.requires_grad, ) else: ep._state_dict[k] = v.to(_get_new_device(v.device, location)) From 69af467d4fd37118cdfc212c88120c7a66fabd68 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 25 Oct 2024 17:58:59 +0000 Subject: [PATCH 073/161] Eliminate c10::value_or_else (#138818) Test Plan: Sandcastle Differential Revision: D64857418 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138818 Approved by: https://github.com/malfet, https://github.com/Skylion007 --- .../ATen/functorch/PyTorchOperatorHacks.cpp | 2 +- aten/src/ATen/native/Convolution.cpp | 4 +- aten/src/ATen/native/Normalization.cpp | 54 +++++++++---------- aten/src/ATen/native/RNN.cpp | 16 +++--- aten/src/ATen/native/cuda/Normalization.cu | 16 +++--- aten/src/ATen/native/cuda/RNN.cu | 6 +-- aten/src/ATen/native/cudnn/BatchNorm.cpp | 12 ++--- aten/src/ATen/native/cudnn/RNN.cpp | 17 +++--- aten/src/ATen/native/group_norm.cpp | 6 +-- .../ATen/native/miopen/BatchNorm_miopen.cpp | 12 ++--- aten/src/ATen/native/miopen/RNN_miopen.cpp | 10 ++-- aten/src/ATen/native/mkldnn/Normalization.cpp | 10 ++-- aten/src/ATen/native/mkldnn/RNN.cpp | 6 +-- aten/src/ATen/native/mps/operations/RnnOps.mm | 6 +-- .../native/quantized/cpu/Normalization.cpp | 2 +- c10/core/TensorOptions.h | 6 +-- c10/util/Optional.h | 13 ++++- torch/csrc/autograd/FunctionsManual.cpp | 9 ++-- 18 files changed, 102 insertions(+), 105 deletions(-) diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 5e7508a7a429b..7bc3a3cbfe44a 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -88,7 +88,7 @@ Tensor binary_cross_entropy_with_logits_hack( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& pos_weight = c10::value_or_else(pos_weight_opt, [] {return Tensor();}); + const Tensor& pos_weight = pos_weight_opt.value_or(Tensor()); Tensor loss; auto max_val = (-input).clamp_min(0); diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 029218c5a1e75..b9354cd610a8a 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -1732,8 +1732,8 @@ std::tuple _convolution_double_backward( const std::option // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned ggI_maybe_owned = at::borrow_from_optional_tensor(ggI_opt); const Tensor& ggI = *ggI_maybe_owned; - const Tensor& ggW_r = c10::value_or_else(ggW_r_opt, [] {return Tensor();}); - const Tensor& ggb = c10::value_or_else(ggb_opt, [] {return Tensor();}); + const Tensor& ggW_r = ggW_r_opt.value_or(Tensor()); + const Tensor& ggb = ggb_opt.value_or(Tensor()); auto ggW = 
ggW_r; diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 411e6025085ed..8e50d93b0b1ef 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -549,9 +549,9 @@ std::tuple _batch_norm_impl_index( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); auto num_features = input.sym_sizes()[1]; @@ -631,10 +631,10 @@ std::tuple _batch_norm_impl_index_backward( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); - const Tensor& save_var_transform = c10::value_or_else(save_var_transform_opt, [] {return Tensor();}); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); + const Tensor& save_mean = save_mean_opt.value_or(Tensor()); + const Tensor& save_var_transform = save_var_transform_opt.value_or(Tensor()); if (input.numel() == 0) { std::vector dims(input.dim() - 1); @@ -675,10 +675,10 @@ Tensor batch_norm( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, bool training, double momentum, double eps, bool cudnn_enabled) { - const Tensor& weight = c10::value_or_else(weight_opt, [] {return Tensor();}); - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& weight = weight_opt.value_or(Tensor()); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); // TODO: switch to the new stack after the 2 week FC window @@ -713,9 +713,9 @@ Tensor instance_norm( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = 
running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); TORCH_CHECK(use_input_stats || (running_mean.defined() && running_var.defined()), "Expected running_mean and running_var to be defined when use_input_stats is false"); @@ -750,7 +750,7 @@ std::tuple batch_norm_update_stats_cpu( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& running_var = running_var_opt.value_or(Tensor()); const bool mixed_type = is_mixed_type(self, running_mean, running_var); return AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "batch_norm_update_stats_cpu", [&] { @@ -769,9 +769,9 @@ std::tuple batch_norm_cpu_out(const Tensor& self, con // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); checkBackend("batch_norm_cpu_out", {self, weight, bias, running_mean, running_var}, Backend::CPU); // Resize out @@ -812,9 +812,9 @@ std::tuple batch_norm_cpu(const Tensor& self, const std: // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU); @@ -879,8 +879,8 @@ std::tuple _batch_norm_no_update( const Tensor& input, const std::optional& weight_opt, const std::optional& bias_opt, const std::optional& running_mean_opt, const std::optional& running_var_opt, double momentum, double eps) { - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); auto [output, save_mean, save_var] = batch_norm_cpu(input, weight_opt, bias_opt, const_cast(running_mean), const_cast(running_var), /*update*/false, momentum, eps); Tensor reserve = at::empty({0}, input.options().dtype(kByte)); @@ -927,10 +927,10 @@ std::tuple batch_norm_backward_cpu(const Tensor& grad_ou // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = 
*weight_maybe_owned; - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); - const Tensor& save_invstd = c10::value_or_else(save_invstd_opt, [] {return Tensor();}); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); + const Tensor& save_mean = save_mean_opt.value_or(Tensor()); + const Tensor& save_invstd = save_invstd_opt.value_or(Tensor()); const bool mixed_type = is_mixed_type(self, weight, running_mean, running_var, save_mean, save_invstd); return AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "batch_norm_backward_cpu", [&] { diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index 69a161b3e2286..00e3739539835 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -1529,7 +1529,7 @@ std::tuple lstm_cell( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; - const Tensor& b_hh = c10::value_or_else(b_hh_opt, [] {return Tensor();}); + const Tensor& b_hh = b_hh_opt.value_or(Tensor()); TORCH_CHECK(hx.size() == 2, "lstm_cell expects two hidden states"); check_rnn_cell_forward_input(input, w_ih.sym_size(1)); @@ -1549,9 +1549,9 @@ _thnn_differentiable_lstm_cell_backward( const std::optional& grad_hy_op // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned grad_hy_maybe_owned = at::borrow_from_optional_tensor(grad_hy_opt); const Tensor& grad_hy = *grad_hy_maybe_owned; - const Tensor& grad_cy = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); - const Tensor& input_bias = c10::value_or_else(input_bias_opt, [] {return Tensor();}); - const Tensor& hidden_bias = c10::value_or_else(hidden_bias_opt, [] {return Tensor();}); + const Tensor& grad_cy = grad_cy_opt.value_or(Tensor()); + const Tensor& input_bias = input_bias_opt.value_or(Tensor()); + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); if (!grad_hy.defined() && !grad_cy.defined()) { return std::tuple(); @@ -1603,7 +1603,7 @@ std::tuple _thnn_differentiable_gru_cell // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; - const Tensor& hidden_bias = c10::value_or_else(hidden_bias_opt, [] {return Tensor();}); + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); Tensor in_g = input_gates; Tensor h_g = hidden_gates; @@ -1643,7 +1643,7 @@ Tensor gru_cell( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; - const Tensor& b_hh = c10::value_or_else(b_hh_opt, [] {return Tensor();}); + const Tensor& b_hh = b_hh_opt.value_or(Tensor()); check_rnn_cell_forward_input(input, w_ih.size(1)); check_rnn_cell_forward_hidden(input, hx, w_hh.size(1), 0); @@ -1657,7 +1657,7 @@ Tensor rnn_tanh_cell( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; - const Tensor& b_hh = c10::value_or_else(b_hh_opt, [] {return Tensor();}); + 
const Tensor& b_hh = b_hh_opt.value_or(Tensor()); static at::Tensor undefined; check_rnn_cell_forward_input(input, w_ih.size(1)); @@ -1671,7 +1671,7 @@ Tensor rnn_relu_cell( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned b_ih_maybe_owned = at::borrow_from_optional_tensor(b_ih_opt); const Tensor& b_ih = *b_ih_maybe_owned; - const Tensor& b_hh = c10::value_or_else(b_hh_opt, [] {return Tensor();}); + const Tensor& b_hh = b_hh_opt.value_or(Tensor()); static at::Tensor undefined; check_rnn_cell_forward_input(input, w_ih.size(1)); diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index ae0908b3abac6..8db7241dee137 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -487,7 +487,7 @@ std::tuple _batch_norm_with_update_cuda( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); Tensor output, save_mean, save_var, reserve; BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, /*training*/true, eps); @@ -513,7 +513,7 @@ std::tuple _batch_norm_with_update_cuda_out( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, /*training*/true, eps); if (backend == BatchNormBackend::Cudnn) { @@ -551,10 +551,10 @@ std::tuple _new_batch_norm_backward_cuda( const std::optional& save_mean_opt, const std::optional& save_var_opt, bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { const Tensor& dummy_bias = at::empty(1); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); - const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); - const Tensor& save_var = c10::value_or_else(save_var_opt, [] {return Tensor();}); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); + const Tensor& save_mean = save_mean_opt.value_or(Tensor()); + const Tensor& save_var = save_var_opt.value_or(Tensor()); BatchNormBackend backend = _select_batch_norm_backend(input, weight, dummy_bias, running_mean, running_var, /*training*/true, eps); @@ -694,7 +694,7 @@ std::tuple batch_norm_gather_stats_cuda(const Tensor& self, cons // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& running_var = running_var_opt.value_or(Tensor()); std::vector counts(mean.size(0), count); Tensor counts_ = at::from_blob((void*)counts.data(), {(int64_t)counts.size()}, self.options().dtype(at::kLong).device(at::kCPU)); @@ -708,7 +708,7 @@ std::tuple 
batch_norm_gather_stats_with_counts_cuda( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned running_mean_maybe_owned = at::borrow_from_optional_tensor(running_mean_opt); const Tensor& running_mean = *running_mean_maybe_owned; - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& running_var = running_var_opt.value_or(Tensor()); auto scalar_type = running_mean.defined() ? running_mean.scalar_type() : self.scalar_type(); diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index 3b10a836c409e..53dd49909b1a6 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -520,7 +520,7 @@ std::tuple _thnn_fused_lstm_cell_cuda( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; - const Tensor& hidden_bias = c10::value_or_else(hidden_bias_opt, [] {return Tensor();}); + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); checkSizes("_thnn_fused_lstm_cell_cuda", {input_gates, "input_gates", 1}, {hidden_gates, "hidden_gates", 2}, @@ -570,7 +570,7 @@ std::tuple _thnn_fused_lstm_cell_backward_impl_cuda( con // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned grad_hy_maybe_owned = at::borrow_from_optional_tensor(grad_hy_opt); const Tensor& grad_hy = *grad_hy_maybe_owned; - const Tensor& grad_cy = c10::value_or_else(grad_cy_opt, [] {return Tensor();}); + const Tensor& grad_cy = grad_cy_opt.value_or(Tensor()); if (!grad_hy.defined() && !grad_cy.defined()) { return std::tuple(); @@ -606,7 +606,7 @@ std::tuple _thnn_fused_gru_cell_cuda( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned input_bias_maybe_owned = at::borrow_from_optional_tensor(input_bias_opt); const Tensor& input_bias = *input_bias_maybe_owned; - const Tensor& hidden_bias = c10::value_or_else(hidden_bias_opt, [] {return Tensor();}); + const Tensor& hidden_bias = hidden_bias_opt.value_or(Tensor()); checkSizes("_thnn_fused_gru_cell_cuda", {input_gates, "input_gates", 1}, {hidden_gates, "hidden_gates", 2}, diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index a3d9166a6a717..c9e2fb361297d 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -133,10 +133,8 @@ std::tuple cudnn_batch_norm( c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - const Tensor& running_mean_t = - c10::value_or_else(running_mean_t_opt, [] { return Tensor(); }); - const Tensor& running_var_t = - c10::value_or_else(running_var_t_opt, [] { return Tensor(); }); + const Tensor& running_mean_t = running_mean_t_opt.value_or(Tensor()); + const Tensor& running_var_t = running_var_t_opt.value_or(Tensor()); TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}, running_mean{running_mean_t, "running_mean", 4}, @@ -283,10 +281,8 @@ std::tuple cudnn_batch_norm_backward( double epsilon, const Tensor& reserveSpace) { // See [Note: hacky wrapper removal for optional tensor] - const Tensor& save_mean_t = - c10::value_or_else(save_mean_t_opt, [] { return Tensor(); }); - const Tensor& save_var_t = - c10::value_or_else(save_var_t_opt, [] { return Tensor(); }); + const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor()); + const Tensor& save_var_t = 
save_var_t_opt.value_or(Tensor()); // TODO: Is it worth it to have a contiguous call or maybe we should go with // whatever format is given here. diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 744649e121f57..f6526acaa61f6 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1402,9 +1402,8 @@ std::tuple _cudnn_rnn( c10::MaybeOwned weight_buf_r_maybe_owned = at::borrow_from_optional_tensor(weight_buf_r_opt); const Tensor& weight_buf_r = *weight_buf_r_maybe_owned; - const Tensor& cx = c10::value_or_else(cx_opt, [] { return Tensor(); }); - const Tensor& fn_dropout_state = - c10::value_or_else(fn_dropout_state_opt, [] { return Tensor(); }); + const Tensor& cx = cx_opt.value_or(Tensor()); + const Tensor& fn_dropout_state = fn_dropout_state_opt.value_or(Tensor()); check_attributes(input_r, weight, {hx, cx}, /*check_dtype=*/true); auto input = input_r; @@ -2115,14 +2114,10 @@ std::tuple> _cudnn_rnn_backward( c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); const Tensor& cx = *cx_maybe_owned; - const Tensor& grad_output_r = - c10::value_or_else(grad_output_r_opt, [] { return Tensor(); }); - const Tensor& grad_hy_r = - c10::value_or_else(grad_hy_r_opt, [] { return Tensor(); }); - const Tensor& grad_cy_r = - c10::value_or_else(grad_cy_r_opt, [] { return Tensor(); }); - const Tensor& dropout_state = - c10::value_or_else(dropout_state_opt, [] { return Tensor(); }); + const Tensor& grad_output_r = grad_output_r_opt.value_or(Tensor()); + const Tensor& grad_hy_r = grad_hy_r_opt.value_or(Tensor()); + const Tensor& grad_cy_r = grad_cy_r_opt.value_or(Tensor()); + const Tensor& dropout_state = dropout_state_opt.value_or(Tensor()); if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 627fa71382e20..0971ddd3cf0df 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -72,7 +72,7 @@ std::tuple native_group_norm( c10::MaybeOwned gamma_maybe_owned = at::borrow_from_optional_tensor(gamma_opt); const Tensor& gamma = *gamma_maybe_owned; - const Tensor& beta = c10::value_or_else(beta_opt, [] { return Tensor(); }); + const Tensor& beta = beta_opt.value_or(Tensor()); // repeated check so expanded weights can call native_group_norm directly but // save mean and variance from forward @@ -185,7 +185,7 @@ Tensor group_norm( c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); }); + const Tensor& bias = bias_opt.value_or(Tensor()); const auto N = input.sym_size(0); const auto C = input.sym_size(1); @@ -224,7 +224,7 @@ std::tuple math_group_norm( c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); }); + const Tensor& bias = bias_opt.value_or(Tensor()); auto input_shape = input.sizes(); at::Tensor input_reshaped = input.view({1, N * group, N ? 
-1 : 1}); diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 236f0747e145f..9002832fc3cc0 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -64,8 +64,8 @@ std::tuple miopen_batch_norm( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - const Tensor& running_mean_t = c10::value_or_else(running_mean_t_opt, [] {return Tensor();}); - const Tensor& running_var_t = c10::value_or_else(running_var_t_opt, [] {return Tensor();}); + const Tensor& running_mean_t = running_mean_t_opt.value_or(Tensor()); + const Tensor& running_var_t = running_var_t_opt.value_or(Tensor()); TensorArg input{ input_t, "input", 1 }, weight{ weight_t, "weight", 2 }, @@ -169,13 +169,13 @@ std::tuple miopen_batch_norm_backward( double epsilon) { // See [Note: hacky wrapper removal for optional tensor] const Tensor& running_mean = - c10::value_or_else(running_mean_opt, [] { return Tensor(); }); + running_mean_opt.value_or(Tensor()); const Tensor& running_var = - c10::value_or_else(running_var_opt, [] { return Tensor(); }); + running_var_opt.value_or(Tensor()); const Tensor& save_mean_t = - c10::value_or_else(save_mean_t_opt, [] { return Tensor(); }); + save_mean_t_opt.value_or(Tensor()); const Tensor& save_var_t = - c10::value_or_else(save_var_t_opt, [] { return Tensor(); }); + save_var_t_opt.value_or(Tensor()); TensorArg input{ input_t, "input", 1 }, grad_output{ grad_output_t, "grad_output", 2 }, diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 5ae764376b267..e19243f70cdb4 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -452,7 +452,7 @@ std::tuple miopen_rnn( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); const Tensor& cx = *cx_maybe_owned; - const Tensor& fn_dropout_state = c10::value_or_else(fn_dropout_state_opt, [] {return Tensor();}); + const Tensor& fn_dropout_state = fn_dropout_state_opt.value_or(Tensor()); check_attributes(input_r, weight, {hx, cx}); auto input = input_r; @@ -766,10 +766,10 @@ std::tuple> miopen_rnn_backward( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); const Tensor& cx = *cx_maybe_owned; - const Tensor& grad_output_r = c10::value_or_else(grad_output_r_opt, [] {return Tensor();}); - const Tensor& grad_hy_r = c10::value_or_else(grad_hy_r_opt, [] {return Tensor();}); - const Tensor& grad_cy_r = c10::value_or_else(grad_cy_r_opt, [] {return Tensor();}); - const Tensor& dropout_state = c10::value_or_else(dropout_state_opt, [] {return Tensor();}); + const Tensor& grad_output_r = grad_output_r_opt.value_or(Tensor()); + const Tensor& grad_hy_r = grad_hy_r_opt.value_or(Tensor()); + const Tensor& grad_cy_r = grad_cy_r_opt.value_or(Tensor()); + const Tensor& dropout_state = dropout_state_opt.value_or(Tensor()); if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { return std::tuple>(Tensor(), Tensor(), Tensor(), std::vector(weight.size())); diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index dcc04f68e1848..88636a8b66b7c 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp 
+++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -138,9 +138,9 @@ std::tuple mkldnn_batch_norm( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); - const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); - const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); + const Tensor& running_mean = running_mean_opt.value_or(Tensor()); + const Tensor& running_var = running_var_opt.value_or(Tensor()); if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), @@ -253,8 +253,8 @@ std::tuple mkldnn_batch_norm_backward(const Tensor& grad // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); - const Tensor& save_invstd = c10::value_or_else(save_invstd_opt, [] {return Tensor();}); + const Tensor& save_mean = save_mean_opt.value_or(Tensor()); + const Tensor& save_invstd = save_invstd_opt.value_or(Tensor()); TORCH_CHECK(train, "mkldnn_batch_norm_backward: currently mkldnn only support train model"); ideep::tensor& grady = itensor_from_mkldnn(grad_output); diff --git a/aten/src/ATen/native/mkldnn/RNN.cpp b/aten/src/ATen/native/mkldnn/RNN.cpp index c1c6e9cbdaa99..cbbae464c7d6a 100644 --- a/aten/src/ATen/native/mkldnn/RNN.cpp +++ b/aten/src/ATen/native/mkldnn/RNN.cpp @@ -315,9 +315,9 @@ std::tuple mkldnn_rnn_la at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor& workspace) { - const Tensor& grad_output_r = c10::value_or_else(grad_output_r_opt, [] {return Tensor();}); - const Tensor& grad_hy_r = c10::value_or_else(grad_hy_r_opt, [] {return Tensor();}); - const Tensor& grad_cy_r = c10::value_or_else(grad_cy_r_opt, [] {return Tensor();}); + const Tensor& grad_output_r = grad_output_r_opt.value_or(Tensor()); + const Tensor& grad_hy_r = grad_hy_r_opt.value_or(Tensor()); + const Tensor& grad_cy_r = grad_cy_r_opt.value_or(Tensor()); if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { return std::make_tuple(Tensor(), Tensor(), Tensor(), Tensor(), Tensor(), Tensor(), Tensor()); } diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm index fee33e951916c..4e46ea37bbadb 100644 --- a/aten/src/ATen/native/mps/operations/RnnOps.mm +++ b/aten/src/ATen/native/mps/operations/RnnOps.mm @@ -358,9 +358,9 @@ using namespace mps; bool is_macos_14_4_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_4_PLUS); - const Tensor& grad_y_r = c10::value_or_else(grad_y_opt, [] { return Tensor(); }); - const Tensor& grad_hy_r = c10::value_or_else(grad_hy_opt, [] { return Tensor(); }); - const Tensor& grad_cy_r = c10::value_or_else(grad_cy_opt, [] { return Tensor(); }); + const Tensor& grad_y_r = grad_y_opt.value_or(Tensor()); + const Tensor& grad_hy_r = grad_hy_opt.value_or(Tensor()); + const Tensor& grad_cy_r = grad_cy_opt.value_or(Tensor()); const auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx[0], input.options()); const auto grad_cy = grad_cy_r.defined() ? 
grad_cy_r : at::zeros_like(hx[1], input.options()); diff --git a/aten/src/ATen/native/quantized/cpu/Normalization.cpp b/aten/src/ATen/native/quantized/cpu/Normalization.cpp index 846d712fbadc4..2fde5c954e782 100644 --- a/aten/src/ATen/native/quantized/cpu/Normalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/Normalization.cpp @@ -389,7 +389,7 @@ Tensor quantized_batch_norm( // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); const Tensor& weight = *weight_maybe_owned; - const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + const Tensor& bias = bias_opt.value_or(Tensor()); Tensor qy; // TODO: this should arguably support 3d as well diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index 2e05d8265edd8..d5412ecbad878 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -29,12 +29,12 @@ DispatchKey computeDispatchKey( std::optional device); inline ScalarType dtype_or_default(std::optional dtype) { - return value_or_else(dtype, [] { return get_default_dtype_as_scalartype(); }); + return dtype.value_or(get_default_dtype_as_scalartype()); } inline caffe2::TypeMeta dtype_or_default( std::optional dtype) { - return value_or_else(dtype, [] { return get_default_dtype(); }); + return dtype.value_or(get_default_dtype()); } inline Layout layout_or_default(std::optional layout) { @@ -42,7 +42,7 @@ inline Layout layout_or_default(std::optional layout) { } inline Device device_or_default(std::optional device) { - return value_or_else(device, [] { return Device(kCPU); }); + return device.value_or(Device(kCPU)); } inline bool pinned_memory_or_default(std::optional pinned_memory) { diff --git a/c10/util/Optional.h b/c10/util/Optional.h index 1c62bc480e5f4..cbb3a5abb47d0 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -20,6 +20,8 @@ using std::nullopt_t; // NOLINTNEXTLINE(misc-unused-using-decls) using std::optional; +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) + namespace detail_ { // the call to convert(b) has return type A and converts b to type A iff b // decltype(b) is implicitly convertible to A @@ -29,7 +31,9 @@ constexpr U convert(U v) { } } // namespace detail_ template -constexpr T value_or_else(const std::optional& v, F&& func) { +[[deprecated( + "Please use std::optional::value_or instead of c10::value_or_else")]] constexpr T +value_or_else(const std::optional& v, F&& func) { static_assert( std::is_convertible_v, T>, "func parameters must be a callable that returns a type convertible to the value stored in the optional"); @@ -37,12 +41,17 @@ constexpr T value_or_else(const std::optional& v, F&& func) { } template -constexpr T value_or_else(std::optional&& v, F&& func) { +[[deprecated( + "Please use std::optional::value_or instead of c10::value_or_else")]] constexpr T +value_or_else(std::optional&& v, F&& func) { static_assert( std::is_convertible_v, T>, "func parameters must be a callable that returns a type convertible to the value stored in the optional"); return v.has_value() ? 
constexpr_move(std::move(v).contained_val()) : detail_::convert(std::forward(func)()); } + +#endif + } // namespace c10 #endif // C10_UTIL_OPTIONAL_H_ diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 862a376eb7f3a..00a856925db04 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -7046,12 +7046,9 @@ mkldnn_rnn_layer_differentiable_backward( at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor& workspace) { - const Tensor& grad_output_r = - c10::value_or_else(grad_output_r_opt, [] { return Tensor(); }); - const Tensor& grad_hy_r = - c10::value_or_else(grad_hy_r_opt, [] { return Tensor(); }); - const Tensor& grad_cy_r = - c10::value_or_else(grad_cy_r_opt, [] { return Tensor(); }); + const Tensor& grad_output_r = grad_output_r_opt.value_or(Tensor()); + const Tensor& grad_hy_r = grad_hy_r_opt.value_or(Tensor()); + const Tensor& grad_cy_r = grad_cy_r_opt.value_or(Tensor()); if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { return std::make_tuple( From d0640b945b128ac16c8e559a87da82560ca02ba5 Mon Sep 17 00:00:00 2001 From: Adam Mainz Date: Fri, 25 Oct 2024 17:59:24 +0000 Subject: [PATCH 074/161] [inductor][nit] removing unnecessary else statements (#138789) Summary: while reading through inductor template code I found a few places where else statements were driving me crazy. Fixing them as I read Test Plan: CI Differential Revision: D64882385 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138789 Approved by: https://github.com/aakhundov --- torch/_inductor/codegen/common.py | 91 ++++++++++++++--------------- torch/_inductor/select_algorithm.py | 6 +- 2 files changed, 45 insertions(+), 52 deletions(-) diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index cd33a7ab46d64..1547a09a5486d 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -327,8 +327,7 @@ def get_wrapper_codegen_for_device(device: str, cpp_wrapper: bool = False): if cpp_wrapper else wrapper_codegen_obj.wrapper_codegen ) - else: - return None + return None @functools.lru_cache(None) @@ -678,8 +677,7 @@ def _print_Pow(self, expr): assert exp >= 0 if exp > 0: return "*".join([self.paren(base)] * exp) - else: # exp == 0 - return "1" + return "1" # Explicit NotImplemented functions are to prevent default sympy printing # behavior, which will just barf out ToFloat(...) to your IR. 
The error @@ -2039,8 +2037,7 @@ def arg_to_bound(x): arg_bounds = list(map(arg_to_bound, args)) return getattr(CSEProxy.vr_analysis, name)(*arg_bounds) - else: - return ValueRanges.unknown() + return ValueRanges.unknown() @staticmethod def indirect_indexing( @@ -2134,8 +2131,7 @@ def store( CSEProxy._update_store_cache(name, value) if name not in V.graph.removed_buffers: return self.store(name, index, value, mode=mode) - else: - return None # type: ignore[return-value] + return None # type: ignore[return-value] @staticmethod def store_reduction(name: str, index: sympy.Expr, value: CSEVariable): @@ -2339,47 +2335,46 @@ def indent_except_first(source: str, num_indents: int, indents_spacing=4): @staticmethod def _template_from_string(source): env = jinja2_env() - if env is not None: - env.filters["indent_except_first"] = KernelTemplate.indent_except_first - from jinja2 import TemplateSyntaxError - - class DetailedTemplateSyntaxError(TemplateSyntaxError): - def __init__(self, original_error): - super().__init__( - original_error.message, - original_error.lineno, - original_error.name, - original_error.filename, - ) - self.original_error = original_error - - def __str__(self): - error_info = f"Error in template at line {self.lineno}\n" - error_info += f"Error message: {self.message}\n" - if hasattr(self.original_error, "source"): - lines = self.original_error.source.split("\n") - error_info += "Context:\n" - start = max(0, self.lineno - 2) - end = min(len(lines), self.lineno + 2) - for i in range(start, end): - if i == self.lineno - 1: - error_info += f"{i + 1}: --> {lines[i]}\n" - if hasattr(self.original_error, "column"): - error_info += ( - " " - + " " * (self.original_error.column - 1) - + "^\n" - ) - else: - error_info += f"{i + 1}: {lines[i]}\n" - return error_info - - try: - return env.from_string(source) - except TemplateSyntaxError as e: - raise DetailedTemplateSyntaxError(e) from e + if env is None: + return None + env.filters["indent_except_first"] = KernelTemplate.indent_except_first + from jinja2 import TemplateSyntaxError + + class DetailedTemplateSyntaxError(TemplateSyntaxError): + def __init__(self, original_error): + super().__init__( + original_error.message, + original_error.lineno, + original_error.name, + original_error.filename, + ) + self.original_error = original_error + + def __str__(self): + error_info = f"Error in template at line {self.lineno}\n" + error_info += f"Error message: {self.message}\n" + if hasattr(self.original_error, "source"): + lines = self.original_error.source.split("\n") + error_info += "Context:\n" + start = max(0, self.lineno - 2) + end = min(len(lines), self.lineno + 2) + for i in range(start, end): + if i == self.lineno - 1: + error_info += f"{i + 1}: --> {lines[i]}\n" + if hasattr(self.original_error, "column"): + error_info += ( + " " + + " " * (self.original_error.column - 1) + + "^\n" + ) + else: + error_info += f"{i + 1}: {lines[i]}\n" + return error_info - return None + try: + return env.from_string(source) + except TemplateSyntaxError as e: + raise DetailedTemplateSyntaxError(e) from e @staticmethod def _fake_get_dtype(fake_out): diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 48ff237c9876d..62c0133ddfb66 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -347,8 +347,7 @@ def stride(self, name, index=None): if isinstance(index, int): return texpr(self.rename_indexing(val[index])) - else: - return ", ".join([texpr(self.rename_indexing(i)) for i in val]) 
+ return ", ".join([texpr(self.rename_indexing(i)) for i in val]) def modification( self, subgraph_number: int, output_name: str, **fixed_inputs @@ -977,8 +976,7 @@ def to_callable(self): fn = self.choice.to_callable() if self.kwargs: return functools.partial(fn, **self.kwargs) - else: - return fn + return fn def hash_key(self): return "-".join( From ba6526814a2b68b39338e0688084f051c6f164db Mon Sep 17 00:00:00 2001 From: Xinran / Allan Rui Date: Fri, 25 Oct 2024 18:00:28 +0000 Subject: [PATCH 075/161] Add dtype attribute to CSEVariable (#136778) Summary: - This diff introduces `dtype` attribute to `TritonCSEVariable` and a dtype propagation helper function to infer dtype from input to output for each op. - There will be a follow-up diff that uses this `dtype` information in `TritonCSEVariable` to perform dtype-aware codegen. Test Plan: CI Differential Revision: D61815079 Pull Request resolved: https://github.com/pytorch/pytorch/pull/136778 Approved by: https://github.com/eellison, https://github.com/blaine-rister --- test/inductor/test_torchinductor.py | 23 +- torch/_inductor/codegen/common.py | 230 ++++++++++++++++++- torch/_inductor/codegen/cpp_utils.py | 10 +- torch/_inductor/codegen/halide.py | 13 +- torch/_inductor/codegen/triton.py | 100 +++++--- torch/_inductor/codegen/triton_split_scan.py | 32 ++- 6 files changed, 355 insertions(+), 53 deletions(-) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 89aeb545b270e..73687f41d95a8 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -12145,18 +12145,35 @@ def f(x, mask): @requires_gpu() @parametrize("upcast_to_fp32", [False, True]) + @config.patch("triton.use_block_ptr", True) def test_codegen_upcast_to_fp32(self, upcast_to_fp32): @torch.compile - def func(a, b): - return a * b + def func(a, b, c, d): + return a * b * c * d - inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=torch.float16),) * 2 + inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=torch.float16),) * 4 with config.patch("triton.codegen_upcast_to_fp32", upcast_to_fp32): func_opt = torch._dynamo.optimize("inductor")(func) code = run_and_get_triton_code(func_opt, *inps) fp32_cast_in_code = "to(tl.float32)" in code self.assertEqual(fp32_cast_in_code, upcast_to_fp32) + @requires_gpu() + @parametrize("load_upcast_to_fp32", [False, True]) + @parametrize("input_dtype", [torch.float16, torch.bfloat16]) + @config.patch("triton.use_block_ptr", True) + def test_dtype_aware_codegen(self, load_upcast_to_fp32, input_dtype): + @torch.compile + def func(a, b, c, d): + return torch.sqrt(a * b * c * d) + + inps = (torch.rand((32, 32), device=GPU_TYPE, dtype=input_dtype),) * 4 + with config.patch("triton.codegen_upcast_to_fp32", load_upcast_to_fp32): + func_opt = torch._dynamo.optimize("inductor")(func) + code = run_and_get_triton_code(func_opt, *inps) + libdevice_cast_in_code = "libdevice.sqrt(tmp3.to(tl.float32))" in code + self.assertNotEqual(libdevice_cast_in_code, load_upcast_to_fp32) + @config.patch("triton.use_block_ptr", False) def test_evict_last_non_coalesced_loads(self): @torch.compile diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 1547a09a5486d..2329cc1aba9ab 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1629,11 +1629,17 @@ class CSEVariable: See example of TritonCSEVariable in triton.py """ - def __init__(self, name, bounds: ValueRanges[Any]): + def __init__( + self, + name, + bounds: ValueRanges[Any], + 
dtype: Optional[torch.dtype] = None, + ): assert isinstance(bounds, ValueRanges) self.name = name self.bounds = bounds self.use_count = 1 # track how many tims this expression is used + self.dtype = dtype def __str__(self): return self.name @@ -1705,6 +1711,7 @@ def generate( bounds: ValueRanges[Any] = ValueRanges.unknown(), write=True, assignment=True, + dtype: Optional[torch.dtype] = None, ) -> CSEVariable: if isinstance(expr, OpsValue): expr = expr.value @@ -1721,7 +1728,7 @@ def generate( cache_key = expr.getvalue() if isinstance(expr, IndentedBuffer) else expr var = self.cache.get(cache_key, None) if not var: - var = self.newvar(bounds) + var = self.newvar(bounds, dtype) self.cache[cache_key] = var if write: if V.kernel.current_node: @@ -1745,13 +1752,217 @@ def generate( return var - def newvar(self, bounds: ValueRanges[Any] = ValueRanges.unknown()) -> CSEVariable: + def newvar( + self, + bounds: ValueRanges[Any] = ValueRanges.unknown(), + dtype: Optional[torch.dtype] = None, + ) -> CSEVariable: var_name = f"{self.name_prefix}{next(self.iter_buffer_ids)}" - var = V.kernel.create_cse_var(var_name, bounds) + var = V.kernel.create_cse_var(var_name, bounds, dtype) self.varname_map[var_name] = var return var +@functools.lru_cache(None) +def get_promoted_dtype(*args, type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND): + def construct_input(inp): + if isinstance(inp, torch._prims_common.Number): + return inp + else: + assert hasattr(inp, "dtype") + + # construct a tmp tensor to use dtype promotion util function + return torch.empty([1], dtype=inp.dtype) + + inps = [construct_input(arg) for arg in args] + _, dtype = torch._prims_common.elementwise_dtypes( + *inps, type_promotion_kind=type_promotion_kind + ) + return dtype + + +def promote_types(args): + dtype_prop_candidates = [] + + # CSEVariable and scalar will be included in dtype_prop_candidates + for arg in args: + if isinstance(arg, str): + continue + elif ( + isinstance(arg, OpsValue) + and isinstance(arg.value, CSEVariable) + and arg.value.dtype is not None + ): + dtype_prop_candidates.append(arg.value) + elif (isinstance(arg, CSEVariable) and arg.dtype is not None) or isinstance( + arg, torch._prims_common.Number + ): + dtype_prop_candidates.append(arg) # type: ignore[arg-type] + + dtype = get_promoted_dtype( + *dtype_prop_candidates, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + ) + + return dtype + + +class DtypePropagationOpsHandler: + """ + Propagate dtype from args to output + """ + + @staticmethod + def default_handler(*args): + # Fallback to FP32 dtype + return torch.float32 + + @staticmethod + def randint64(seed, offset, low, high): + return torch.int64 + + @staticmethod + def where(a, b, c): + return promote_types([b, c]) + + @staticmethod + def to_dtype_bitcast(x, dtype: torch.dtype, src_dtype: torch.dtype): + return dtype + + @staticmethod + def load_seed(name, offset): + return torch.float32 + + @staticmethod + def masked(mask, body, other): + # TODO: inspect body to propagate dtype + return torch.float32 + + @staticmethod + def index_expr(expr, dtype): + return dtype + + @staticmethod + def isnan(x): + return torch.bool + + @staticmethod + def lt(a, b): + return torch.bool + + @staticmethod + def to_dtype(x, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None): + return dtype + + @staticmethod + def constant(value, dtype): + return dtype + + @staticmethod + def mul(a, b): + return promote_types([a, b]) + + @staticmethod + def sub(a, b): + return promote_types([a, b]) + + @staticmethod + def 
add(a, b): + return promote_types([a, b]) + + @staticmethod + def div(a, b): + return promote_types([a, b]) + + @staticmethod + def abs(x): + return promote_types([x]) + + @staticmethod + def exp(x): + return promote_types([x]) + + @staticmethod + def truediv(a, b): + return promote_types([a, b]) + + @staticmethod + def pow(a, b): + return promote_types([a, b]) + + @staticmethod + def sqrt(x): + return promote_types([x]) + + @staticmethod + def rsqrt(x): + return promote_types([x]) + + @staticmethod + def sigmoid(x): + return promote_types([x]) + + @staticmethod + def gelu(x): + return promote_types([x]) + + @staticmethod + def neg(x): + return promote_types([x]) + + @staticmethod + def minimum(a, b): + return promote_types([a, b]) + + @staticmethod + def maximum(a, b): + return promote_types([a, b]) + + @staticmethod + def log(x): + return promote_types([x]) + + @staticmethod + def log1p(x): + return promote_types([x]) + + @staticmethod + def gt(a, b): + return torch.bool + + @staticmethod + def ge(a, b): + return torch.bool + + @staticmethod + def reciprocal(x): + return promote_types([x]) + + @staticmethod + def and_(a, b): + return torch.bool + + @staticmethod + def bitwise_right_shift(a, b): + return a.dtype + + @staticmethod + def bitwise_left_shift(a, b): + return a.dtype + + @staticmethod + def sin(x): + return promote_types([x]) + + @staticmethod + def cos(x): + return promote_types([x]) + + @staticmethod + def mod(a, b): + return promote_types([a, b]) + + class CodeGen: def __init__(self) -> None: super().__init__() @@ -1987,8 +2198,17 @@ def inner(*args, **kwargs): value = getattr(parent_handler, name)(*args, **kwargs) # type: ignore[has-type] def do_cse(v): + output_dtype = getattr( + DtypePropagationOpsHandler, + name, + DtypePropagationOpsHandler.default_handler, + )(*args) + csevar = V.kernel.cse.generate( - V.kernel.compute, v, bounds=bounds + V.kernel.compute, + v, + bounds=bounds, + dtype=output_dtype, ) csevar.update_on_args(name, args, kwargs) return csevar diff --git a/torch/_inductor/codegen/cpp_utils.py b/torch/_inductor/codegen/cpp_utils.py index 7c874980ceda6..6c15e76253b94 100644 --- a/torch/_inductor/codegen/cpp_utils.py +++ b/torch/_inductor/codegen/cpp_utils.py @@ -176,10 +176,14 @@ def deduce_dtype_for_cpp_cse_variable(name, *args, **kwargs): class CppCSEVariable(CSEVariable): - def __init__(self, name, bounds: ValueRanges[Any]) -> None: - super().__init__(name, bounds) + def __init__( + self, + name, + bounds: ValueRanges[Any], + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__(name, bounds, dtype) self.is_vec = False - self.dtype: Optional[torch.dtype] = None self.dependent_itervars: Set[sympy.Symbol] = set() def __repr__(self) -> str: diff --git a/torch/_inductor/codegen/halide.py b/torch/_inductor/codegen/halide.py index 033346b1696d6..27a043d785443 100644 --- a/torch/_inductor/codegen/halide.py +++ b/torch/_inductor/codegen/halide.py @@ -572,8 +572,13 @@ def _typecheck_HalideOverrides(h: HalideOverrides) -> OpsHandler[str]: class HalideCSEVariable(CSEVariable): undefined_re = re.compile(r"\b(tmp\d+)\[\?\]") - def __init__(self, name, bounds: ValueRanges[Any]) -> None: - super().__init__(name, bounds) + def __init__( + self, + name, + bounds: ValueRanges[Any], + dtype: Optional[torch.dtype] = None, + ) -> None: + super().__init__(name, bounds, dtype) self.used_dims: Optional[List[sympy.Symbol]] = None def update_on_args(self, name, args, kwargs): @@ -706,9 +711,9 @@ def __init__( self.buffer_aliases: Dict[str, List[str]] = 
defaultdict(list) self.has_indirect_indexing = False - def create_cse_var(self, name, bounds=None): + def create_cse_var(self, name, bounds=None, dtype=None): self.body.writeline(f"{name} = hl.Func({name!r})") - return HalideCSEVariable(name, bounds) + return HalideCSEVariable(name, bounds, dtype) def finalize_indexing(self, indices: Sequence[sympy.Expr]): """ diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 92ee3709d6fa0..bae35bf3c2cbe 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -704,10 +704,11 @@ def triton_acc_type(dtype: torch.dtype) -> str: class TritonCSEVariable(CSEVariable): - def __init__(self, name, bounds: ValueRanges[Any]) -> None: - super().__init__(name, bounds) + def __init__(self, name, bounds: ValueRanges[Any], dtype: torch.dtype) -> None: + super().__init__(name, bounds, dtype) # We'll use this to track which masks the variable needs when used for indirect indexing self.mask_vars: OrderedSet[str] = OrderedSet() + assert dtype is not None, "TritonCSEVariable must have dtype" def update_on_args(self, name, args, kwargs): for arg in args: @@ -849,7 +850,14 @@ def expm1(x): @staticmethod def sqrt(x): - return f"libdevice.sqrt({x})" + if config.triton.codegen_upcast_to_fp32: + return f"libdevice.sqrt({x})" + else: + needs_upcast = x.dtype in (torch.float16, torch.bfloat16) + orig_dtype = triton_type(x.dtype) + upcast_string = ".to(tl.float32)" if needs_upcast else "" + downcast_string = f".to({orig_dtype})" if needs_upcast else "" + return f"libdevice.sqrt({x}{upcast_string}){downcast_string}" @staticmethod def libdevice_sqrt(x): @@ -1169,11 +1177,18 @@ def index_expr(cls, expr, dtype): indexing = V.kernel.indexing(expr, block_ptr=False) assert isinstance(indexing, IndexingOptions) var = V.kernel.cse.generate( - V.kernel.compute, indexing.index_str, bounds=get_bounds_index_expr(expr) + V.kernel.compute, + indexing.index_str, + bounds=get_bounds_index_expr(expr), + dtype=dtype, ) if dtype not in (torch.int32, torch.int64): - var = V.kernel.cse.generate(V.kernel.compute, cls.to_dtype(var, dtype)) + var = V.kernel.cse.generate( + V.kernel.compute, + cls.to_dtype(var, dtype), + dtype=dtype, + ) var.mask_vars = indexing.mask_vars return var @@ -1183,6 +1198,7 @@ def masked(mask, body, other): mask = V.kernel.cse.generate( V.kernel.compute, f"{mask}.to(tl.int1)", + dtype=torch.bool, ) nodes = body.graph.find_nodes(op="output") @@ -1207,6 +1223,7 @@ def masked(mask, body, other): V.kernel.compute, f"tl.full({result}.shape, {constant_repr(other)}, {result}.dtype)", bounds=ValueRanges.wrap(other), + dtype=result.dtype, ) ret = ops.where(new_mask, result, other) else: @@ -1228,8 +1245,8 @@ def frexp(x): if cache_key in V.kernel.cse.cache: return V.kernel.cse.cache[cache_key] - mantissa = V.kernel.cse.newvar() - exponent = V.kernel.cse.newvar() + mantissa = V.kernel.cse.newvar(dtype=x.dtype) + exponent = V.kernel.cse.newvar(dtype=x.dtype) V.kernel.compute.writeline( f"{mantissa}, {exponent} = triton_helpers.frexp({x})" ) @@ -1801,7 +1818,7 @@ def check_bounds( isinstance(m, TritonCSEVariable) for m in indexing.mask_vars ) buffer = self.get_load_buffer(indexing) - self.cse.generate(buffer, line, assignment=False) + self.cse.generate(buffer, line, assignment=False, dtype=torch.int32) def get_load_buffer(self, indexing): if indexing.has_indirect() or indexing.has_tmpmask(): @@ -1869,6 +1886,8 @@ def load(self, name: str, index: sympy.Expr): advance_block_ptr = None append_broadcast = None + dtype = 
V.graph.get_dtype(name) + if should_unwrap_unspec_arg(name): line = var else: @@ -1887,26 +1906,27 @@ def load(self, name: str, index: sympy.Expr): else: line = f"tl.load({var} + ({indexing.index_str}), {indexing.mask_str}{ep}{other})" - dtype = V.graph.get_dtype(name) if ( dtype in (torch.float16, torch.bfloat16) and config.triton.codegen_upcast_to_fp32 ): line += ".to(tl.float32)" + dtype = torch.float32 if dtype == torch.bool and torch.version.hip is None: # Workaround for https://github.com/openai/triton/issues/2151 # tl.load returns int8 when loading from pointer to int1 # NOTE: Currently causes hangs on bool UTs for ROCm line += ".to(tl.int1)" + dtype = torch.bool load_buffer = self.get_load_buffer(indexing) - result_var = self.cse.generate(load_buffer, line) + result_var = self.cse.generate(load_buffer, line, dtype=dtype) assert isinstance(result_var, TritonCSEVariable) result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] if append_broadcast: line = f"tl.broadcast_to({result_var}, {append_broadcast})" - result_var = self.cse.generate(load_buffer, line) + result_var = self.cse.generate(load_buffer, line, dtype=dtype) if advance_block_ptr: load_buffer.writeline(advance_block_ptr) @@ -2004,6 +2024,7 @@ def bucketize( f"{sorter_indices}, " f"{block_size}, " ")", + dtype=values.dtype, # type: ignore[attr-defined] ) return result @@ -2041,7 +2062,9 @@ def reduction( dense_size_str = self.dense_size_str() value = self._map_tuple_or_scalar( lambda v: self.cse.generate( - self.compute, f"tl.broadcast_to({v}, {dense_size_str})" + self.compute, + f"tl.broadcast_to({v}, {dense_size_str})", + dtype=v.dtype, ), value, ) @@ -2072,7 +2095,7 @@ def final_argreduce(buffer, result_var, value, index): dim = self.triton_tensor_ndim() - 1 acc_type = triton_acc_type(src_dtype) - result_var: Any = self.cse.newvar() + result_var: Any = self.cse.newvar(dtype=dtype) result_var.mask_vars = OrderedSet(var for var in masks if var[0] != "r") cond = " & ".join(masks) @@ -2086,7 +2109,9 @@ def where_cond(tval, fval): default = self._map_tuple_or_scalar(constant_repr, default) def _mask_value(value, default): - return self.cse.generate(self.compute, where_cond(value, default)) + return self.cse.generate( + self.compute, where_cond(value, default), dtype=value.dtype + ) if isinstance(value, tuple): masked_value = [_mask_value(v, d) for v, d in zip(value, default)] @@ -2098,6 +2123,7 @@ def _mask_value(value, default): self.cse.generate( self.compute, f"tl.broadcast_to({reduction_range_prefix}index, {masked_value}.shape)", + dtype=torch.int64, ) ) root_op = {"argmax": "max", "argmin": "min"}[reduction_type] @@ -2112,16 +2138,18 @@ def _mask_value(value, default): elif reduction_type == "welford_combine": mean, m2, weight = masked_value welford = f"triton_helpers.welford({mean}, {m2}, {weight}, {dim})" - mean, m2, weight = (self.cse.newvar() for _ in range(3)) + mean, m2, weight = (self.cse.newvar(dtype=dtype) for _ in range(3)) self.compute.writeline(f"{mean}, {m2}, {weight} = {welford}") result_var = tuple( - self.cse.generate(self.compute, self.reduction_resize(var_name)) + self.cse.generate( + self.compute, self.reduction_resize(var_name), dtype=dtype + ) for var_name in (mean, m2, weight) ) else: result_var = self.cse.generate( - self.compute, final_reduction(masked_value) + self.compute, final_reduction(masked_value), dtype=dtype ) else: accumulator = f"_{result_var}" @@ -2193,8 +2221,8 @@ def _mask_value(value, default): ) result_mean = result_var - result_m2 = self.cse.newvar() - result_weight = 
self.cse.newvar() + result_m2 = self.cse.newvar(dtype=dtype) + result_weight = self.cse.newvar(dtype=dtype) self.suffix.splice( f"""\ {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( @@ -2296,6 +2324,7 @@ def inner(*args, **kwargs): return cse.generate( helper, getattr(overrides, name)(*args, **kwargs), + dtype=torch.float32, ) return inner @@ -2336,10 +2365,12 @@ def scan( value_dtype = self.cse.generate( self.compute, f"{value}.to({triton_compute_type(dtype)})", + dtype=dtype, ) value = self.cse.generate( self.compute, f"tl.broadcast_to({value_dtype}, {self.dense_size_str()})", + dtype=dtype, ) broadcasted_values.append(value) @@ -2347,7 +2378,7 @@ def scan( cond = " & ".join(masks) if not self.persistent_reduction: - accumulator = self.cse.newvar() + accumulator = self.cse.newvar(dtype=dtype) reduced_size = self.dense_size_list() reduced_size[-1] = "1" reduced_size = f"[{', '.join(reduced_size)}]" @@ -2362,11 +2393,12 @@ def scan( def csv(values): return " ".join(f"{value}," for value in values) - def cse_multiple(line, n, masks): + def cse_multiple(line, values, masks, dtypes): + n = len(values) cache_keys = [f"{line}, {i}, {masks}" for i in range(n)] if all(cache_key in self.cse.cache for cache_key in cache_keys): return [self.cse.cache[cache_key] for cache_key in cache_keys] - result_vars = [self.cse.newvar() for _ in range(n)] + result_vars = [self.cse.newvar(dtype=_dtype) for _dtype in dtypes] self.compute.writeline( f"{csv(result_vars)} = {line}", ) @@ -2378,8 +2410,9 @@ def cse_multiple(line, n, masks): partial_scan_vars = cse_multiple( f"tl.associative_scan(({csv(broadcasted_values)}), {dim}, {combine_helper_fn})", - len(values), + values, masks, + dtypes, ) if not self.persistent_reduction: @@ -2388,14 +2421,18 @@ def cse_multiple(line, n, masks): # last scan value partial_reduce_vars = [ cse_compute( - f"triton_helpers.select_one(({partial_scan_var}), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)" + f"triton_helpers.select_one(({partial_scan_var}), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)", + dtype=partial_scan_var.dtype, ) for partial_scan_var in partial_scan_vars ] accs_next = combine_fn(tuple(accumulators), tuple(partial_reduce_vars)) full_scan_vars = combine_fn(tuple(accumulators), partial_scan_vars) result_vars = [ - cse_compute(f"tl.where(roffset > 0, {full_scan}, {partial_scan})") + cse_compute( + f"tl.where(roffset > 0, {full_scan}, {partial_scan})", + dtype=partial_scan.dtype, + ) for full_scan, partial_scan in zip(full_scan_vars, partial_scan_vars) ] for acc_next, accumulator, partial_reduce in zip( @@ -2432,19 +2469,22 @@ def sort( cse_compute = functools.partial(self.cse.generate, self.compute) dim = self.triton_tensor_ndim() - 1 + assert len(dtypes) == len(values) broadcasted_values = [ - cse_compute(f"tl.broadcast_to({value}, {self.dense_size_str()})") - for value in values + cse_compute( + f"tl.broadcast_to({value}, {self.dense_size_str()})", dtype=dtypes[i] + ) + for i, value in enumerate(values) ] def csv(values): return " ".join(f"{value}," for value in values) - def cse_multiple(line, n, masks): + def cse_multiple(line, n, masks, dtypes): cache_keys = [f"{line}, {i}, {masks}" for i in range(n)] if all(cache_key in self.cse.cache for cache_key in cache_keys): return [self.cse.cache[cache_key] for cache_key in cache_keys] - result_vars = [self.cse.newvar() for _ in range(n)] + result_vars = [self.cse.newvar(dtype=dtypes[i]) for i in range(n)] # type: ignore[attr-defined] self.compute.writeline( f"{csv(result_vars)} = 
{line}", ) @@ -2462,7 +2502,7 @@ def cse_multiple(line, n, masks): f"triton_helpers.sort_with_index({broadcasted_values[0]}, {broadcasted_values[1]}," f" {rnumel}, {dim}, stable={stable}, descending={descending})" ) - result_vars = cse_multiple(line, len(values), masks) + result_vars = cse_multiple(line, len(values), masks, dtypes) else: raise AssertionError("Unhandled sort") diff --git a/torch/_inductor/codegen/triton_split_scan.py b/torch/_inductor/codegen/triton_split_scan.py index e93c17a81f33a..3ffe313aec4da 100644 --- a/torch/_inductor/codegen/triton_split_scan.py +++ b/torch/_inductor/codegen/triton_split_scan.py @@ -97,7 +97,7 @@ def scan(self, dtypes, combine_fn, values): scratch_type_triton.primitive_bitwidth // 8 ) - cse_load = functools.partial(self.cse.generate, self.loads) + cse_load = functools.partial(self.cse.generate, self.loads, dtype=dtype) cse_compute = functools.partial(self.cse.generate, self.compute) assert len(self.numels) == 2, "Unexpected tiling" @@ -117,15 +117,26 @@ def scan(self, dtypes, combine_fn, values): self.filter_masks(masks) assert not self._load_mask, "ops.scan not supported inside ops.masked" - value = cse_compute(f"{value}.to({compute_type})") - value = cse_compute(f"tl.broadcast_to({value}, {self.dense_size_str()})") + value = cse_compute( + f"{value}.to({compute_type})", + dtype=dtype, + ) + value = cse_compute( + f"tl.broadcast_to({value}, {self.dense_size_str()})", + dtype=dtype, + ) combine_helper_fn = self._lift_helper(combine_fn, 1) dim = self.triton_tensor_ndim() - 1 assert dim == 0, "" - block_sum = cse_compute(f"tl.reduce({value}, {dim}, {combine_helper_fn})") - exclusive_prefix = self.cse.newvar() + block_sum = cse_compute( + f"tl.reduce({value}, {dim}, {combine_helper_fn})", + dtype=dtype, + ) + exclusive_prefix = self.cse.newvar( + dtype=dtype, + ) if element_nbits == 64: self.compute.splice( f""" @@ -158,13 +169,18 @@ def scan(self, dtypes, combine_fn, values): ) # Compute final cumsum block_scan = cse_compute( - f"tl.associative_scan({value}, {dim}, {combine_helper_fn})" + f"tl.associative_scan({value}, {dim}, {combine_helper_fn})", + dtype=dtype, ) combined_result = cse_compute( - f"{combine_helper_fn}({exclusive_prefix}, {block_scan})" + f"{combine_helper_fn}({exclusive_prefix}, {block_scan})", + dtype=dtype, ) return ( - cse_compute(f"tl.where(roffset == 0, {block_scan}, {combined_result})"), + cse_compute( + f"tl.where(roffset == 0, {block_scan}, {combined_result})", + dtype=dtype, + ), ) def _get_heuristic(self): From 77587f43d2f6a1e99464cd1ae0317eaef04bd09d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 25 Oct 2024 18:09:50 +0000 Subject: [PATCH 076/161] Add one more shard for CPU pull jobs (#138894) The first shard is close to 3.5 hours and timing out flakily in trunk now, for example https://github.com/pytorch/pytorch/actions/runs/11509141659/job/32039126506. 
So, I think we could just add one more shard in the same spirit as https://github.com/pytorch/pytorch/pull/137433 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138894 Approved by: https://github.com/Skylion007 --- .github/workflows/pull.yml | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6171fc5c0aa35..206c350aa42fb 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -54,10 +54,11 @@ jobs: docker-image-name: pytorch-linux-jammy-py3.9-gcc11 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "docs_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -186,10 +187,11 @@ jobs: docker-image-name: pytorch-linux-focal-py3.9-clang10 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo", 
shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -218,10 +220,11 @@ jobs: docker-image-name: pytorch-linux-focal-py3.11-clang10 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -252,10 +255,11 @@ jobs: docker-image-name: pytorch-linux-focal-py3.12-clang10 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, From dbbdfd9df583725c320ebf9aab0aca6d2fc357f6 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 16 Oct 2024 14:01:24 -0700 Subject: [PATCH 077/161] Add pytorch.wait_counter.dynamo_compile (#138072) I was discussing with James March how the current fx_codegen_and_compile counter doesn't actually capture all compile time. 
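For reference, the new counter is just a wait-counter guard wrapped around the whole compile region. A minimal sketch of the pattern (the `timed_compile` wrapper here is illustrative only; the actual change below threads the guard through the existing ExitStack in `convert_frame.py` and `compile_fx.py`):

```python
from torch.monitor import _WaitCounter

def timed_compile(compile_inner, *args, **kwargs):
    # Illustrative wrapper, not part of this PR: attribute the whole
    # compile region to the pytorch.wait_counter.dynamo_compile counter.
    with _WaitCounter("pytorch.wait_counter.dynamo_compile").guard():
        return compile_inner(*args, **kwargs)
```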
This one is more accurate and corresponds closely to the existing events in dynamo_compile table. Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/138072 Approved by: https://github.com/markkm --- torch/_dynamo/convert_frame.py | 17 ++++++++++++++--- torch/_inductor/compile_fx.py | 6 ++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 64d884ff33e5b..42ddc36ad642e 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -45,6 +45,7 @@ GuardOnDataDependentSymNode, ) from torch.fx.graph_module import _forward_from_src as original_forward_from_src +from torch.monitor import _WaitCounter from torch.nn.parallel.distributed import DistributedDataParallel from torch.utils._python_dispatch import ( _disable_current_modes, @@ -690,9 +691,19 @@ def compile_inner( hooks: Hooks, transform: Callable[[List[Instruction], Dict[str, Any]], Any], ) -> Optional[GuardedCode]: - with dynamo_timed("_compile.compile_inner", phase_name="entire_frame_compile"): - with CompileTimeInstructionCounter.record(): - return _compile_inner(code, one_graph, hooks, transform) + with contextlib.ExitStack() as stack: + stack.enter_context( + dynamo_timed( + "_compile.compile_inner", phase_name="entire_frame_compile" + ) + ) + stack.enter_context( + _WaitCounter("pytorch.wait_counter.dynamo_compile").guard() + ) + stack.enter_context(CompileTimeInstructionCounter.record()) + return _compile_inner(code, one_graph, hooks, transform) + + return None # dead, but see https://github.com/python/mypy/issues/7577 @compile_time_strobelight_meta(phase_name="compile_inner") @maybe_cprofile diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index 2a167231b5e5d..4af7b796091ea 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -542,6 +542,12 @@ def compile_fx_inner( "compile_fx_inner", phase_name="inductor_compile", fwd_only=False ) ) + # NB: Why is this the dynamo_compile counter? The rule here is that + # if it gets an entry in the dynamo_compile table, we also want to + # tick up the wait counter. We have to displeasingly manually trigger + # the counter here because we may dropped into compile_fx directly + # from lazy backwards compilation. + stack.enter_context(_WaitCounter("pytorch.wait_counter.dynamo_compile").guard()) stack.enter_context(with_fresh_cache_if_config()) stack.enter_context(DebugContext()) From 03f9136870ee18a4510338c287f0a947a86687ec Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 24 Oct 2024 20:26:27 -0700 Subject: [PATCH 078/161] Add wait counter on cuda::device_synchronize (#138883) The wait counter is typically only minute precision, but if there is a collective in the queue it will show up. We think this explains up to eight minutes of delay in some compile traces we're looking at, but the counter would definitively prove it. Signed-off-by: Edward Z. 
Yang Differential Revision: [D64944970](https://our.internmc.facebook.com/intern/diff/D64944970) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138883 Approved by: https://github.com/eqy --- c10/cuda/CUDAFunctions.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 6dc568e12328e..b1d573b16d1c4 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -138,6 +139,7 @@ void device_synchronize() { if (C10_UNLIKELY(interp)) { (*interp)->trace_gpu_device_synchronization(c10::kCUDA); } + STATIC_SCOPED_WAIT_COUNTER(pytorch.wait_counter.cuda_device_synchronize); C10_CUDA_CHECK(cudaDeviceSynchronize()); } From de54246c422a2057befaff449e30a22b4444075e Mon Sep 17 00:00:00 2001 From: Colin <486199+c00w@users.noreply.github.com> Date: Fri, 25 Oct 2024 18:47:44 +0000 Subject: [PATCH 079/161] Recomend pip install -r requirements in the unit testing guidelines. (#137797) Somehow make setup-env as recomended in CONTRIBUTING.MD is not installing all dependencies require to run tests This makes it slightly clearer when running tests. Specific repro on my side was ``` git checkout e7679663070e3149ae7cd6e28d376d86852ce9e4 make setup-env conda activate pytorch-deps python test/test_utils_internal.py ``` which is what my reading of the instructions implies should be correct. Pull Request resolved: https://github.com/pytorch/pytorch/pull/137797 Approved by: https://github.com/albanD --- CONTRIBUTING.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f71f1f98ad6c3..c2eab67762074 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -286,6 +286,11 @@ The following packages should be installed with either `conda` or `pip`: - `expecttest` and `hypothesis` - required to run tests - `mypy` - recommended for linting - `pytest` - recommended to run tests more selectively +Running +``` +pip install -r requirements +``` +will install these dependencies for you. All PyTorch test suites are located in the `test` folder and start with `test_`. 
Run the entire test From 07dbc428812f790a59116d7add79655aa244d0de Mon Sep 17 00:00:00 2001 From: chilli Date: Thu, 24 Oct 2024 19:08:22 -0700 Subject: [PATCH 080/161] Stop force realizing to prevent recursion errors unless it's much bigger (#138881) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138881 Approved by: https://github.com/shunting314 ghstack dependencies: #138733, #138794 --- test/dynamo/test_repros.py | 2 +- test/inductor/test_perf.py | 46 ++++++++++++++++++++++++++++++++++++++ torch/_inductor/graph.py | 2 +- torch/_inductor/ir.py | 7 ++++-- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index e380939570cea..fc922b39fb11b 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -4169,7 +4169,7 @@ def fn(x): def test_inductor_no_recursionerror_on_for_loops(self): def forward(x): - for _ in range(1000): + for _ in range(10000): x = 1.0 * x return x diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index c7c341d9165a4..87d8e383bd58a 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -533,6 +533,52 @@ def f(x, scale, amax_keep_dim): self.assertEqual(actual_numel_amax_keep_dim, actual_numel_amax_no_keep_dim) self.assertGreaterAlmostEqual(actual_numel_amax_keep_dim, str(expected_numel)) + def test_create_block_mask(self): + def mk_3d_flex_natten_mask(dims, kernel_size): + T, H, W = dims + K_T, K_H, K_W = kernel_size + spatial = H * W + + def get_x_y_t(idx: int) -> tuple[int, int, int]: + t = idx // spatial + s = idx % spatial + x = s // W + y = s % W + return x, y, t + + def get_mask(b, h, q_idx, kv_idx): + q_x, q_y, q_t = get_x_y_t(q_idx) + kv_x, kv_y, kv_t = get_x_y_t(kv_idx) + kernel_x = q_x.clamp(K_W // 2, (W - 1) - K_W // 2) + kernel_y = q_y.clamp(K_H // 2, (H - 1) - K_H // 2) + kernel_t = q_t.clamp(K_T // 2, (T - 1) - K_T // 2) + hori_mask = (kernel_x - kv_x).abs() <= K_W // 2 + vert_mask = (kernel_y - kv_y).abs() <= K_H // 2 + temp_mask = (kernel_t - kv_t).abs() <= K_T // 2 + return hori_mask & vert_mask & temp_mask + + return get_mask + + T = 4 + H = 16 + W = 16 + t = 5 + h = 5 + w = 5 + data_size = (T, H, W) + kernel_size = (t, h, w) + S = T * H * W + from torch.nn.attention.flex_attention import create_block_mask + + mask_mod = mk_3d_flex_natten_mask(data_size, kernel_size) + + torch.compile(create_block_mask)(mask_mod, None, None, S, S) + numel = int(count_numel(create_block_mask, mask_mod, None, None, S, S)) + + # We should be writing way less than a quadratic amount of bytes here + # With fusion, we should only be writing a linear number of bytes + self.assertLess(numel * 5, S * S) + class SchedulerFusionTests(TestCase): """ diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 1db6df67f5147..5f2848a98d1ce 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -1576,7 +1576,7 @@ def debug(msg: str) -> None: curr = result.data.data if isinstance(curr, Pointwise): # Use inner fn as a rough proxy. Good enough. 
- if curr.has_large_inner_fn(): + if curr.has_large_inner_fn(threshold=100): result.realize() # This is not complete, but it doesn't have to be: origin_node diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 53d06748ff2d8..695b94b27a737 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -599,8 +599,11 @@ def inner_fn_str(self): self.inner_fn, *self.inner_fn_args() ) - def has_large_inner_fn(self): - return self.inner_fn_opcount().num_ops > config.realize_opcount_threshold + def has_large_inner_fn(self, threshold=None): + if threshold is None: + threshold = 0 + threshold = max(threshold, config.realize_opcount_threshold) + return self.inner_fn_opcount().num_ops > threshold def inner_fn_free_unbacked_symbols(self): index = self._index(self.ranges) From 392221b39036bb140e32f428fc6e318744005651 Mon Sep 17 00:00:00 2001 From: chilli Date: Thu, 24 Oct 2024 19:08:22 -0700 Subject: [PATCH 081/161] Made DDPOptimizer work with HOPs (#138787) Fixes https://github.com/pytorch/pytorch/issues/137481 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138787 Approved by: https://github.com/yf225 ghstack dependencies: #138733, #138794, #138881 --- test/distributed/test_dynamo_distributed.py | 119 +++++++++++++++++++- torch/_dynamo/backends/distributed.py | 17 --- torch/_export/utils.py | 2 + torch/fx/passes/split_module.py | 57 ++++++---- 4 files changed, 154 insertions(+), 41 deletions(-) diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 078f319adaf7f..5394a515aad33 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -32,6 +32,7 @@ lambda_auto_wrap_policy, transformer_auto_wrap_policy, ) +from torch.nn.attention.flex_attention import flex_attention from torch.nn.parallel import DistributedDataParallel as DDP from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FLASH_ATTENTION, @@ -1293,6 +1294,118 @@ def opt_fn(inputs): self.assertEqual(len(break_reasons), 4) self.assertTrue(all("DDPOptimizer" in r.reason for r in break_reasons)) + @patch.object(config, "optimize_ddp", True) + def test_compiled_flex_attention_full_model_ddp(self): + class Model(torch.nn.Module): + def __init__(self, S, H, D): + super().__init__() + + self.S = S + self.H = H + self.D = D + + alibi_bias = self.generate_alibi_bias(H) + self.register_buffer("alibi_bias", alibi_bias, persistent=True) + self.attention = flex_attention + + self.project_qk = torch.nn.Linear(H * D, H * D * 2) + self.project_v = torch.nn.Linear(H * D, H * D) + + def forward(self, hidden_states): + batch_size, _, _ = hidden_states.size() + + query, key = self.project_qk(hidden_states).chunk(2, dim=2) + query = query.view(self.S, batch_size, self.H, self.D) + query = query.permute(1, 2, 0, 3) + + key = key.view(self.S, batch_size, self.H, self.D) + key = key.permute(1, 2, 0, 3) + + value = self.project_v(hidden_states) + value = value.view(self.S, batch_size, self.H, self.D) + value = value.permute(1, 2, 0, 3) + + return self.attention(query, key, value, score_mod=self.alibi_score_mod) + + def generate_alibi_bias(self, num_heads): + alibi_bias = [-((i + 1) * 8.0) / num_heads for i in range(num_heads)] + return torch.tensor(alibi_bias) + + def alibi_score_mod(self, score, b, h, q_idx, kv_idx): + bias = (q_idx - kv_idx) * self.alibi_bias[h] + return score + bias + + B = 16 + H = 12 + S = 512 + D = 64 + + device = "cuda" + model = Model(S, H, D) + model.to(device) + model = torch.compile(model) + 
model = DDP(model, device_ids=self.device_ids) + + hidden_states = torch.randn(B, S, H * D).to(device) + attention_scores = model(hidden_states) + torch.cuda.synchronize() + + @patch.object(config, "optimize_ddp", True) + def test_compiled_flex_attention_local_ddp(self): + class Model(torch.nn.Module): + def __init__(self, S, H, D): + super().__init__() + + self.S = S + self.H = H + self.D = D + + alibi_bias = self.generate_alibi_bias(H) + self.register_buffer("alibi_bias", alibi_bias, persistent=True) + self.attention = torch.compile(flex_attention) + + self.project_qk = torch.nn.Linear(H * D, H * D * 2) + self.project_v = torch.nn.Linear(H * D, H * D) + + def forward(self, hidden_states): + batch_size, _, _ = hidden_states.size() + + query, key = self.project_qk(hidden_states).chunk(2, dim=2) + query = query.view(self.S, batch_size, self.H, self.D) + query = query.permute(1, 2, 0, 3) + + key = key.view(self.S, batch_size, self.H, self.D) + key = key.permute(1, 2, 0, 3) + + value = self.project_v(hidden_states) + value = value.view(self.S, batch_size, self.H, self.D) + value = value.permute(1, 2, 0, 3) + + return self.attention(query, key, value, score_mod=self.alibi_score_mod) + + def generate_alibi_bias(self, num_heads): + alibi_bias = [-((i + 1) * 8.0) / num_heads for i in range(num_heads)] + return torch.tensor(alibi_bias) + + def alibi_score_mod(self, score, b, h, q_idx, kv_idx): + bias = (q_idx - kv_idx) * self.alibi_bias[h] + return score + bias + + B = 16 + H = 12 + S = 512 + D = 64 + + device = "cuda" + model = Model(S, H, D) + model.to(device) + model = torch.compile(model) + model = DDP(model, device_ids=self.device_ids) + + hidden_states = torch.randn(B, S, H * D).to(device) + attention_scores = model(hidden_states) + torch.cuda.synchronize() + @patch.object(config, "optimize_ddp", True) @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor(self): @@ -1548,11 +1661,7 @@ def forward(self, x): backend = "aot_eager" cnt = torch._dynamo.testing.CompileCounterWithBackend(backend) - with self.assertRaisesRegex( - torch._dynamo.exc.BackendCompilerFailed, - "DDPOptimizer backend: Found a higher order op in the graph", - ): - torch.compile(mod, backend=cnt)(*args) + torch.compile(mod, backend=cnt)(*args) def test_fsdp_orig_params_assert(self): # Test with basic FSDP wrapping (outer wrap around whole model) diff --git a/torch/_dynamo/backends/distributed.py b/torch/_dynamo/backends/distributed.py index 3b79d1e68cf8a..bb35a9117daa6 100644 --- a/torch/_dynamo/backends/distributed.py +++ b/torch/_dynamo/backends/distributed.py @@ -413,23 +413,6 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]): to compile each subgraph. Finally, stiches compiled graphs into one graphmodule and returns its callable. """ - if has_higher_order_op(gm): - # This indicates presence of a higher order op. For now, we - # have no way to break the higher order op into two buckets. - # Allowing higher order ops in the graph also requires - # changes in the split_module, becuase graph splitter - # currently assumes that all the args of all ops are - # tensors, but in the case of higher order ops, it could be - # a graph module. As a workaround, we are shortcircuiting - raise NotImplementedError( - "DDPOptimizer backend: Found a higher order op in the graph. " - "This is not supported. Please turn off DDP optimizer using " - "torch._dynamo.config.optimize_ddp=False. 
Note that this can " - "cause performance degradation because there will be one bucket " - "for the entire Dynamo graph. Please refer to this issue - " - "https://github.com/pytorch/pytorch/issues/104674." - ) - # 1: compute the partition map according to DDP bucket logic buckets = [Bucket()] # (size, param_names) processed_modules = set() diff --git a/torch/_export/utils.py b/torch/_export/utils.py index ad0bfee45258a..a34aea5519a0a 100644 --- a/torch/_export/utils.py +++ b/torch/_export/utils.py @@ -634,6 +634,8 @@ def node_inline_(call_mod_node: torch.fx.Node) -> None: with gm.graph.inserting_before(call_mod_node): for node in body: new_node = gm.graph.node_copy(node) + if node.op == "get_attr": + setattr(gm, node.target, getattr(sub_gm, node.target)) node_replace_(node, new_node) if len(output) > 0: diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py index b4437af5323ed..7df05aac83fa1 100644 --- a/torch/fx/passes/split_module.py +++ b/torch/fx/passes/split_module.py @@ -40,6 +40,15 @@ def __repr__(self) -> str: ) +def _get_attr_from_qualname(mod: torch.nn.Module, qualname: str) -> Any: + attr_val = mod + for atom in qualname.split("."): # type: ignore[union-attr] + if not hasattr(attr_val, atom): + raise AttributeError(f"Node target {qualname} not found!") + attr_val = getattr(attr_val, atom) + return attr_val + + # Creates subgraphs out of main graph @compatibility(is_backward_compatible=True) def split_module( @@ -179,11 +188,8 @@ def construct_graph( elif node.op == "get_attr": base_mod_env[node.name] = base_mod_graph.get_attr(node.target) # type: ignore[arg-type] base_mod_env[node.name].meta = node.meta.copy() - attr_val = m - for atom in node.target.split("."): # type: ignore[union-attr] - if not hasattr(attr_val, atom): - raise AttributeError(f"Node target {node.target} not found!") - attr_val = getattr(attr_val, atom) + assert isinstance(node.target, str) + attr_val = _get_attr_from_qualname(m, node.target) base_mod_attrs[node.target] = attr_val # type: ignore[index] return base_mod_env, base_mod_attrs @@ -412,13 +418,34 @@ def instantiate_node_partition_mapping(node): # add placeholders to partition inputs for partition_name in sorted_partitions: partition = partitions[partition_name] + new_inputs: Dict[str, None] = {} for inp in partition.inputs: - placeholder = partition.graph.placeholder( - inp, - type_expr=orig_nodes[inp].type, - ) + orig_node = orig_nodes[inp] + # We don't pass in get_attr nodes as inputs to the partition, but + # instead set them as targets and use getattr within the module + + if orig_node.op == "get_attr": + assert isinstance(orig_node.target, str) + + orig_attr = _get_attr_from_qualname(m, orig_node.target) + if isinstance(orig_attr, torch.nn.Module): + placeholder = partition.graph.get_attr(orig_node.target) + partition.targets[orig_node.target] = orig_attr + else: + placeholder = partition.graph.placeholder( + inp, + type_expr=orig_nodes[inp].type, + ) + new_inputs[inp] = None + else: + placeholder = partition.graph.placeholder( + inp, + type_expr=orig_nodes[inp].type, + ) + new_inputs[inp] = None placeholder.meta = orig_nodes[inp].meta.copy() partition.environment[orig_nodes[inp]] = placeholder + partition.inputs = new_inputs # Transform nodes and collect targets for partition's submodule for node in m.graph.nodes: @@ -435,16 +462,8 @@ def instantiate_node_partition_mapping(node): if node.op not in ["call_module", "get_attr"]: target = node.target else: - target_atoms = node.target.split(".") - target_attr = m - for atom 
in target_atoms: - if not hasattr(target_attr, atom): - raise AttributeError( - f"Operator target {node.target} not found!" - ) - target_attr = getattr(target_attr, atom) - # target = target_atoms[-1] - target = "_".join(target_atoms) + target_attr = _get_attr_from_qualname(m, node.target) + target = node.target.replace(".", "_") partition.targets[target] = target_attr # Fill in the passed-in mapping from new qualname to old qualname if qualname_map is not None: From 7d283309d808b9c080f96eefa8b62b60118034c1 Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Thu, 24 Oct 2024 17:55:39 +0000 Subject: [PATCH 082/161] Avoid calling `realize()` on LazyVariableTracker on reconstruct (#138495) Fixes: https://github.com/pytorch/pytorch/issues/137686 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138495 Approved by: https://github.com/zou3519 --- test/dynamo/test_reconstruct.py | 5 ++--- torch/_dynamo/variables/dicts.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/test/dynamo/test_reconstruct.py b/test/dynamo/test_reconstruct.py index c4fee05889b72..f78660ae248e7 100644 --- a/test/dynamo/test_reconstruct.py +++ b/test/dynamo/test_reconstruct.py @@ -16,9 +16,9 @@ def _filter_instructions(instructions, opname): class ReconstructTest(torch._dynamo.test_case.TestCase): @contextlib.contextmanager - def register_bytecode_hook(self, check_fn): + def register_bytecode_hook(self, fn): def hook(code, out_code): - check_fn(list(dis.get_instructions(out_code))) + fn(list(dis.get_instructions(out_code))) return code torch._dynamo.reset() @@ -40,7 +40,6 @@ def hook(instructions: List[dis.Instruction]): self.assertEqual(build_map[0].argval, 1) def f(d, t): - d[1] = t d[40] = t + 1 t = torch.randn(3, 4) diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py index 2d382159dbe69..b1688060db3ac 100644 --- a/torch/_dynamo/variables/dicts.py +++ b/torch/_dynamo/variables/dicts.py @@ -203,10 +203,13 @@ def len(self): ] ) - def _maybe_realize(self, item): - return item.realize() if item else item - def reconstruct(self, codegen): + def is_new_item(value, other): + # compare the id of the realized values if both values are not lazy VTs + if value and value.is_realized() and other.is_realized(): + return id(value.realize()) != id(other.realize()) + return id(value) != id(other) + # instructions to load collections.OrderedDict if necessary if self.user_cls is collections.OrderedDict: codegen.add_push_null( @@ -221,11 +224,8 @@ def reconstruct(self, codegen): num_args = 0 for key, value in self.items.items(): # We can safely call realize() here as it won't introduce any new guards - is_new_item = ( - self._maybe_realize(self.original_items.get(key.vt)) != value.realize() - ) - - if is_new_item or self.should_reconstruct_all: + item = self.original_items.get(key.vt) + if is_new_item(item, value) or self.should_reconstruct_all: codegen(key.vt) codegen(value) num_args += 1 From c6bb9b53f4d49dbfb2a19553731f8fb01e19dc96 Mon Sep 17 00:00:00 2001 From: Yidi Wu Date: Wed, 23 Oct 2024 16:49:09 -0700 Subject: [PATCH 083/161] [scan] better error handling and remove redundant tests (#137967) Pull Request resolved: https://github.com/pytorch/pytorch/pull/137967 Approved by: https://github.com/zou3519 --- test/functorch/test_control_flow.py | 263 +++++++++++----------------- torch/_higher_order_ops/__init__.py | 2 + torch/_higher_order_ops/scan.py | 153 +++++++++------- torch/testing/_internal/hop_db.py | 98 ++++++++--- 4 files changed, 270 insertions(+), 
246 deletions(-) diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index a0f82ded433c5..200902b3dea44 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -1648,10 +1648,16 @@ def test_scan_binary_operator(self, reverse, device): A = torch.randn(state_dim, requires_grad=True, device=device) elements = (A.repeat((timesteps, 1)), projected_inputs) init = tuple( - [torch.ones_like(torch._ops.ops.aten.slice(elements[0], 0, 0, 1, 1))] + [ + torch.ones_like( + torch._ops.ops.aten.slice(elements[0], 0, 0, 1, 1), + requires_grad=True, + ) + ] + [ torch.zeros_like( - torch._ops.ops.aten.slice(projected_inputs, 0, 0, 1, 1) + torch._ops.ops.aten.slice(projected_inputs, 0, 0, 1, 1), + requires_grad=True, ) ] ) @@ -1812,8 +1818,7 @@ def fct_pointwise(x, y): self.assertEqual(result, expected_result) @requires_cuda - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_wrong_pytree(self, device): + def test_scan_wrong_pytree(self): # Init and input have same pytree def fct_wrong_pytree(x, y): return ( @@ -1829,9 +1834,9 @@ def fct_wrong_pytree(x, y): }, ) - x = torch.randn(3, 2, 2, device=device) - y = torch.randn(3, 2, 2, device=device) - z = torch.randn(3, 2, 2, device=device) + x = torch.randn(3, 2, 2) + y = torch.randn(3, 2, 2) + z = torch.randn(3, 2, 2) inp = {"i": x, "j": ([y], [{"o": z}])} inp_flat, inp_spec = pytree.tree_flatten(inp) init_flat = [torch._ops.ops.aten.slice(e, 0, 0, 1, 1) for e in inp_flat] @@ -1841,8 +1846,8 @@ def fct_wrong_pytree(x, y): # Should be: RuntimeError, # r"The number of leaves of the pytree of the new carry produced by # the operator needs to match the length of the pytree of the init", - torch._dynamo.exc.Unsupported, - "Observed exception.*", + RuntimeError, + "The number of leaves of the pytree of the new carry", ): result = scan(fct_wrong_pytree, init, inp, dim=0) @@ -2307,110 +2312,135 @@ def test_scan_compile_cnt(self, reverse, device): self.assertEqual(cnt.frame_count, 6) @requires_cuda - @parametrize("reverse", [False, True]) @parametrize("compile_mode", ["none", "eager"]) - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_init_scanned_0(self, reverse, compile_mode, device): + def test_scan_init_scanned_0(self, compile_mode): scan_fct = compile_mode_helper(scan, compile_mode) # Only init and no input - x = torch.randn(3, 1, 2, device=device) - init = torch.randn(3, 2, device=device) + x = torch.randn(3, 1, 2) + init = torch.randn(3, 2) dim = 1 # Scan dimension is 0 init = torch._ops.ops.aten.slice(x, dim, 0, 1, 1) inp = torch._ops.ops.aten.slice(x, dim, 1, None, 1) - with self.assertRaisesRegex( - # Should be: RuntimeError, "Input leaves must have a scan dimension > 0" - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - result_init = scan_fct( - get_scan_combine_fn("add", False), - init, - inp, - dim=dim, - reverse=reverse, - ) + if compile_mode == "none": + with self.assertRaisesRegex( + RuntimeError, + "xs leaves must have a scan dimension > 0", + ): + result_init = scan_fct( + get_scan_combine_fn("add", False), + init, + inp, + dim=dim, + ) + else: + with self.assertRaisesRegex( + # Should be: RuntimeError, "Input leaves must have a scan dimension > 0" + torch._dynamo.exc.Unsupported, + "Observed exception.*", + ): + result_init = scan_fct( + get_scan_combine_fn("add", False), + init, + inp, + dim=dim, + ) @requires_cuda - @parametrize("reverse", [False, True]) @parametrize("compile_mode", ["none", 
"eager"]) - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_init_non_tensor(self, reverse, compile_mode, device): + def test_scan_init_non_tensor(self, compile_mode): scan_fct = compile_mode_helper(scan, compile_mode) - x = torch.randn(3, 1, 2, device=device) + x = torch.randn(3, 1, 2) dim = 1 # Init is a float and not a tensor init = 1.0 - with self.assertRaisesRegex( - # Should be: RuntimeError, "Init leaves must be a Tensor" - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - result_init = scan_fct( - get_scan_combine_fn("add", False), init, x, dim=dim, reverse=reverse - ) + if compile_mode == "none": + with self.assertRaisesRegex( + RuntimeError, + "All init leaves must be a Tensor", + ): + result_init = scan_fct( + get_scan_combine_fn("add", False), init, x, dim=dim + ) + else: + with self.assertRaisesRegex( + # Should be: RuntimeError, "Init leaves must be a Tensor" + torch._dynamo.exc.Unsupported, + "Observed exception.*", + ): + result_init = scan_fct( + get_scan_combine_fn("add", False), init, x, dim=dim + ) @requires_cuda - @parametrize("reverse", [False, True]) @parametrize("compile_mode", ["none", "eager"]) - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_init_wrong_shape(self, reverse, compile_mode, device): + def test_scan_init_wrong_shape(self, compile_mode): scan_fct = compile_mode_helper(scan, compile_mode) # Only init and no input - x = torch.randn(3, 1, 2, device=device) + x = torch.randn(3, 1, 2) dim = 1 # Init wrong shape (Other dim different) - inp = torch._ops.ops.aten.slice(x, dim, 1, None, 1) - init = torch._ops.ops.aten.slice(x, dim, 0, 1, 1) - init = torch.tile(init, (1, 2, 1)) - with self.assertRaisesRegex( - # Should be: RuntimeError, "The size of tensor a.*" - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - result_init = scan_fct( - get_scan_combine_fn("add", False), - init, - inp, - dim=dim, - reverse=reverse, - ) + init = torch.randn(1, 2) + if compile_mode == "none": + with self.assertRaisesRegex(RuntimeError, "The shape of the new_carry"): + result_init = scan_fct( + get_scan_combine_fn("add", False), + init, + x, + dim=dim, + ) + else: + with self.assertRaisesRegex( + # Should be: RuntimeError, "The size of tensor a.*" + torch._dynamo.exc.Unsupported, + "Observed exception.*", + ): + result_init = scan_fct( + get_scan_combine_fn("add", False), + init, + x, + dim=dim, + ) @requires_cuda - @parametrize("reverse", [False, True]) @parametrize("compile_mode", ["none", "eager"]) - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_init_wrong_pytree(self, reverse, compile_mode, device): + def test_scan_init_wrong_pytree(self, compile_mode): def add_one_carry(x: torch.Tensor, y: torch.Tensor): return x[0], x scan_fct = compile_mode_helper(scan, compile_mode) # Only init and no input - x = torch.randn(3, 1, 2, device=device) + x = torch.randn(3, 1, 2) dim = 1 # Init wrong pytree - inp = torch._ops.ops.aten.slice(x, dim, 1, None, 1) init = ( torch._ops.ops.aten.slice(x, dim, 0, 1, 1), torch._ops.ops.aten.slice(x, dim, 0, 1, 1), ) - with self.assertRaisesRegex( - # Should be: RuntimeError: The number of leaves of the pytree of the new carry produced - # by the operator needs to match the length of the pytree of the init - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - result_init = scan_fct(add_one_carry, init, inp, dim=dim, reverse=reverse) + if compile_mode == "none": + with self.assertRaisesRegex( + RuntimeError, 
+ "The number of leaves of the pytree of the new carry produced by the operator", + ): + result_init = scan_fct(add_one_carry, init, x, dim=dim) + + else: + with self.assertRaisesRegex( + # Should be: RuntimeError: The number of leaves of the pytree of the new carry produced + # by the operator needs to match the length of the pytree of the init + torch._dynamo.exc.Unsupported, + "Observed exception.*", + ): + result_init = scan_fct(add_one_carry, init, x, dim=dim) @requires_cuda @parametrize("reverse", [False, True]) @@ -2514,54 +2544,6 @@ def add_scalar_carry_sliced_out(x: torch.Tensor, y: torch.Tensor): result[1] = pytree.tree_map(lambda t: t.movedim(0, dim), result[1]) self.assertEqual(result[1], result_exp_PT) - @requires_cuda - @parametrize("reverse", [False, True]) - @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) - def test_scan_carry_wrong_pytree(self, reverse, device): - def fct_pointwise_carry_wrong_pytree(x, y): - return ( - ( - x["i"], - { - "i": x["i"] * y["i"], - "j": ( - [x["j"][0][0] * y["j"][0][0]], - [{"o": x["j"][1][0]["o"] + y["j"][1][0]["o"]}], - ), - }, - ), - { - "i": x["i"] * y["i"], - "j": ( - [x["j"][0][0] * y["j"][0][0]], - [{"o": x["j"][1][0]["o"] + y["j"][1][0]["o"]}], - ), - }, - ) - - x = torch.randn(3, 2, 2, device=device) - y = torch.randn(3, 2, 2, device=device) - z = torch.randn(3, 2, 2, device=device) - inp = {"i": x, "j": ([y], [{"o": z}])} - inp_flat, inp_spec = pytree.tree_flatten(inp) - init_flat = [torch._ops.ops.aten.slice(e, 0, 0, 1, 1) for e in inp_flat] - init = pytree.tree_unflatten(init_flat, inp_spec) - - # Wrong pytree of the carry produced by the operation - with self.assertRaisesRegex( - # Should be: RuntimeError: The number of leaves of the pytree of the new carry - # produced by the operator needs to match the length of the pytree of the init - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - result = scan( - fct_pointwise_carry_wrong_pytree, - init, - inp, - dim=0, - reverse=reverse, - ) - @requires_cuda @parametrize("reverse", [False, True]) @parametrize("device", [torch.device("cpu"), torch.device("cuda")]) @@ -2753,45 +2735,6 @@ def RNN(x: torch.Tensor, y: torch.Tensor): self.assertEqual(result[0].unsqueeze(0), expected_result_state) self.assertEqual(result[1], expected_result[0]) - @skipIfNoDynamoSupport - def test_scan_simple_graph_no_carry(self): - x = torch.randn(3, 10, 2, device=torch.device("cpu")) - init = torch.randn(1, 10, 2, device=torch.device("cpu")) - - def f(fct, init, xs): - return scan(fct, init, xs, dim=0, reverse=True) - - # Wrong number of returns from function - with self.assertRaisesRegex( - # Should be: RuntimeError: The pytree of the new carry produced - # by the operator needs to match the pytree of the init - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - gm = make_fx(f, tracing_mode="symbolic")( - get_scan_combine_fn("add", True), init, x - ) - - @skipIfNoDynamoSupport - def test_scan_simple_graph_wrong_carry(self): - def add_wrong_carry(x: torch.Tensor, y: torch.Tensor): - return (x + y)[0, :], x + y - - x = torch.randn(3, 10, 2, device=torch.device("cpu")) - init = torch.randn(1, 10, 2, device=torch.device("cpu")) - - def f(fct, init, xs): - return scan(fct, init, xs, dim=0, reverse=True) - - # Wrong carry shape - with self.assertRaisesRegex( - # Should be: RuntimeError: The pytree of the new carry produced by - # the operator needs to match the pytree of the init - torch._dynamo.exc.Unsupported, - "Observed exception.*", - ): - gm = make_fx(f, 
tracing_mode="symbolic")(add_wrong_carry, init, x) - @skipIfNoDynamoSupport def test_scan_simple_graph_wrong_dtype(self): def add_wrong_dtype(x: torch.Tensor, y: torch.Tensor): @@ -2808,10 +2751,10 @@ def f(fct, init, xs): # Should be: RuntimeError: Expected the init and # the new carry produced by the operator to be a tensor of # torch.int64 but got torch.float32 and torch.int64 - torch._dynamo.exc.UncapturedHigherOrderOpError, - ".*", + RuntimeError, + "The dtype of the new_carry", ): - gm = make_fx(f, tracing_mode="symbolic")(add_wrong_dtype, init, x) + f(add_wrong_dtype, init, x) @skipIfNoDynamoSupport @skipIfCrossRef # Arg order changes with crossref @@ -2856,7 +2799,7 @@ def forward(self, L_init_ : torch.Tensor, L_xs_ : torch.Tensor): l_init_ = L_init_ l_xs_ = L_xs_ select = l_xs_.select(0, 0) - out_l = l_init_ + select; out_l = None + new_carry = l_init_ + select; new_carry = None add_1 = l_init_ + select; select = add_1 = None child = l_init_.clone(); child = None child_1 = torch.select_copy(l_xs_, 0, 0); child_1 = None diff --git a/torch/_higher_order_ops/__init__.py b/torch/_higher_order_ops/__init__.py index f961a9f43cbf7..8c78306699f5e 100644 --- a/torch/_higher_order_ops/__init__.py +++ b/torch/_higher_order_ops/__init__.py @@ -5,6 +5,7 @@ ) from torch._higher_order_ops.hints_wrap import hints_wrapper from torch._higher_order_ops.invoke_subgraph import invoke_subgraph +from torch._higher_order_ops.scan import scan from torch._higher_order_ops.while_loop import while_loop @@ -12,6 +13,7 @@ "cond", "while_loop", "invoke_subgraph", + "scan", "flex_attention", "flex_attention_backward", "hints_wrapper", diff --git a/torch/_higher_order_ops/scan.py b/torch/_higher_order_ops/scan.py index 5c34ac8f248a8..a5a08fea26a31 100644 --- a/torch/_higher_order_ops/scan.py +++ b/torch/_higher_order_ops/scan.py @@ -111,12 +111,86 @@ def add(x: torch.Tensor, y: torch.Tensor): if not isinstance(reverse, bool): raise RuntimeError("Reverse must be a bool, but got " + str(type(reverse))) + leaves_init, spec_init = pytree.tree_flatten(init) + leaves_xs, spec_xs = pytree.tree_flatten(xs) + + if len(leaves_init) == 0: + raise RuntimeError("Init tensors must be provided") + for x in leaves_init: + if not isinstance(x, torch.Tensor): + raise RuntimeError(f"All init leaves must be a Tensor but got {x}") + for x in leaves_xs: + if not isinstance(x, torch.Tensor): + raise RuntimeError(f"All xs leaves must be a Tensor but got {x}") + if x.shape[dim] == 0: + raise RuntimeError( + f"All xs leaves must have a scan dimension > 0 but got {x}" + ) + + if len(leaves_xs) == 0: + return pytree.tree_unflatten(leaves_init, spec_init), xs + + shape = leaves_xs[0].shape + ndim = len(shape) + dim = utils.canonicalize_dim(ndim, dim) + + out = combine_fn( + pytree.tree_unflatten(leaves_init, spec_init), + pytree.tree_unflatten([elem.select(dim, 0) for elem in leaves_xs], spec_xs), + ) + + # The first output needs to have the same pytree as init + carry_leaves = pytree.tree_leaves(out[0]) + if len(carry_leaves) != len(leaves_init): + raise RuntimeError( + f"The number of leaves of the pytree of the new carry produced by the operator is {len(carry_leaves)}\ +doesn't match the length of the pytree of the init {len(leaves_init)}" + ) + + def _check_new_carry_match_init(leaves_init, carry_leaves): + for i, (init, new_carry) in enumerate(zip(leaves_init, carry_leaves)): + if init.shape != new_carry.shape: + raise RuntimeError( + f"The shape of the new_carry[{i}] {new_carry.shape} doesn't match that of the init[{i}] {init.shape}." 
+ ) + if init.stride() != new_carry.stride(): + raise RuntimeError( + f"The stride of the new_carry[{i}] {new_carry.stride()} doesn't match that of the init[{i}] {init.stride()}." + ) + if init.dtype != new_carry.dtype: + raise RuntimeError( + f"The dtype of the new_carry[{i}] {new_carry.dtype} doesn't match that of the init[{i}] {init.dtype}." + ) + if init.requires_grad != new_carry.requires_grad: + raise RuntimeError( + f"The requires_grad of the new_carry[{i}] {new_carry.requires_grad} doesn't match that of the init[{i}] {init.requires_grad}." # noqa: B950 + ) + + _check_new_carry_match_init(leaves_init, carry_leaves) + + # There are no pytree restrictions on the second output of the operator + out_leaves, tree_out = pytree.tree_flatten(out[1]) + # TODO: Support closures/nn_modules in order to be able represent RNNs with scan # TODO: Support _inductor lowering # TODO: Support Autograd # TODO: Unify handling of pytrees for control flow ops, such as cond, while_loop, etc. # TODO: Unify the list inputs of control flow ops to tuple. + combine_fn = functools.partial( + wrap_combine_fn_flat, + combine_fn=combine_fn, + spec_init=spec_init, + spec_xs=spec_xs, + num_init_leaves=len(leaves_init), + num_inp_leaves=len(leaves_xs), + ) + + def run_flattened_scan(combine_fn, leaves_init, leaves_xs, dim, reverse): + return scan_op( + combine_fn, leaves_init, leaves_xs, dim, reverse, additional_inputs=[] + ) + if not torch._dynamo.is_compiling(): from torch._dynamo.backends.debugging import ( make_eager_backend_with_torch_function_mode, @@ -128,71 +202,26 @@ def add(x: torch.Tensor, y: torch.Tensor): backend = make_eager_backend_with_torch_function_mode(metadata_mode) else: backend = "eager" - return torch.compile(scan, backend=backend, fullgraph=True)( - combine_fn, init, xs, dim=dim, reverse=reverse + result = torch.compile( + run_flattened_scan, backend=backend, fullgraph=True + )( + combine_fn, + leaves_init, + leaves_xs, + dim=dim, + reverse=reverse, ) + else: + result = run_flattened_scan(combine_fn, leaves_init, leaves_xs, dim, reverse) - leaves_init, spec_init = pytree.tree_flatten(init) - leaves_xs, spec_xs = pytree.tree_flatten(xs) - - if len(leaves_init) == 0: - raise RuntimeError("Init tensors must be provided") - if any(not isinstance(x, torch.Tensor) for x in leaves_init): - raise RuntimeError("All init leaves must be a Tensor") - if any(not isinstance(x, torch.Tensor) for x in leaves_xs): - raise RuntimeError("All xs leaves must be a Tensor") - if any(x.shape[dim] == 0 for x in leaves_xs): - raise RuntimeError("All xs leaves must have a scan dimension > 0") - - if len(leaves_xs) > 0: - shape = leaves_xs[0].shape - ndim = len(shape) - dim = utils.canonicalize_dim(ndim, dim) - - out = combine_fn( - pytree.tree_unflatten(leaves_init, spec_init), - pytree.tree_unflatten([elem.select(dim, 0) for elem in leaves_xs], spec_xs), - ) - - # The first output needs to have the same pytree as init - carry_leaves = pytree.tree_leaves(out[0]) - if len(carry_leaves) != len(leaves_init): - raise RuntimeError( - "The number of leaves of the pytree of the new carry produced by the operator\ - needs to match the length of the pytree of the init" - ) - if any( - in_l.shape != out_l.shape for in_l, out_l in zip(leaves_init, carry_leaves) - ): - raise RuntimeError( - "The pytree of the new carry produced by the operator needs to match the pytree of the init" - ) - - # There are no pytree restrictions on the second output of the operator - out_leaves, tree_out = pytree.tree_flatten(out[1]) - - combine_fn = 
functools.partial( - wrap_combine_fn_flat, - combine_fn=combine_fn, - spec_init=spec_init, - spec_xs=spec_xs, - num_init_leaves=len(leaves_init), - num_inp_leaves=len(leaves_xs), - ) - - result_carry, result_flat = _extract_carry_and_out( - scan_op( - combine_fn, leaves_init, leaves_xs, dim, reverse, additional_inputs=[] - ), - len(leaves_init), - ) - - return pytree.tree_unflatten(result_carry, spec_init), pytree.tree_unflatten( - result_flat, tree_out - ) + result_carry, result_flat = _extract_carry_and_out( + result, + len(leaves_init), + ) - else: - return pytree.tree_unflatten(leaves_init, spec_init), xs + return pytree.tree_unflatten(result_carry, spec_init), pytree.tree_unflatten( + result_flat, tree_out + ) class ScanOp(HigherOrderOperator): diff --git a/torch/testing/_internal/hop_db.py b/torch/testing/_internal/hop_db.py index 9b7f7d9b26333..25bf3f16806a6 100644 --- a/torch/testing/_internal/hop_db.py +++ b/torch/testing/_internal/hop_db.py @@ -1,47 +1,58 @@ # mypy: ignore-errors -import torch import functools -from torch.testing import make_tensor import unittest + +import torch from functorch.experimental.control_flow import map -from torch.testing._internal.opinfo.core import ( - OpInfo, - SampleInput, -) -from torch.testing._internal.common_dtype import all_types_and, custom_types -from torch.testing._internal.opinfo.core import DecorateInfo +from torch.nn.attention.flex_attention import _create_empty_block_mask, flex_attention +from torch.testing import make_tensor from torch.testing._internal.common_device_type import onlyCUDA -from torch.nn.attention.flex_attention import flex_attention, _create_empty_block_mask +from torch.testing._internal.common_dtype import all_types_and, custom_types +from torch.testing._internal.opinfo.core import DecorateInfo, OpInfo, SampleInput + def sample_inputs_map(opinfo, device, dtype, requires_grad, **kwargs): make_arg = functools.partial( - make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) - yield SampleInput([make_arg(2, 2, 2, low=0.1, high=2), make_arg(2, 2, 2, low=0.1, high=2)], - args=(make_arg(1, low=0.1, high=2), make_arg(1, low=0.1, high=2))) + make_tensor, device=device, dtype=dtype, requires_grad=requires_grad + ) + yield SampleInput( + [make_arg(2, 2, 2, low=0.1, high=2), make_arg(2, 2, 2, low=0.1, high=2)], + args=(make_arg(1, low=0.1, high=2), make_arg(1, low=0.1, high=2)), + ) + def inner_f(x, y0, y1): - return [x[0].cos().add_(1.) 
* y0, (x[1] + y1.sin()).cos_().view(x[1].size())] + return [x[0].cos().add_(1.0) * y0, (x[1] + y1.sin()).cos_().view(x[1].size())] + def simple_map(xs, y0, y1): def f(x, y0, y1): return inner_f(x, y0, y1) + return map(f, xs, y0, y1) + def nested_map(xs, y0, y1): def f1(xx, y0, y1): def f2(x, y0, y1): return inner_f(x, y0, y1) + return map(f2, xx, y0, y1) + return map(f1, xs, y0, y1) + def triple_nested_map(xs, y0, y1): def f0(xs, y0, y1): def f1(xx, y0, y1): def f2(x, y0, y1): return inner_f(x, y0, y1) + return map(f2, xx, y0, y1) + return map(f1, xs, y0, y1) + return map(f0, xs, y0, y1) @@ -108,16 +119,21 @@ def sample_inputs_invoke_subgraph(opinfo, device, dtype, requires_grad, **kwargs ) yield SampleInput(make_arg(2, 2, 2, low=0.1, high=2)) + def simple_invoke_subgraph(x): def fn(x): return (torch.sin(x),) + return torch._higher_order_ops.invoke_subgraph(fn, None, (x,)) + def sample_inputs_auto_functionalize(opinfo, device, dtype, requires_grad, **kwargs): make_arg = functools.partial( make_tensor, device=device, dtype=dtype, requires_grad=False ) - yield SampleInput(make_arg(2, 2, 2, low=0.1, high=2), make_arg(2, 2, 2, low=0.1, high=2)) + yield SampleInput( + make_arg(2, 2, 2, low=0.1, high=2), make_arg(2, 2, 2, low=0.1, high=2) + ) def simple_auto_functionalize(x, z): @@ -134,13 +150,8 @@ def score_mod(score, b, h, m, n): q, k, v = (make_arg(2, 2, 128, 8, low=0.1, high=2) for _ in range(3)) block_mask = _create_empty_block_mask(q, k) - yield SampleInput( - q, - k, - v, - score_mod, - block_mask - ) + yield SampleInput(q, k, v, score_mod, block_mask) + def sample_inputs_while_loop(opinfo, device, dtype, requires_grad, **kwargs): make_arg = functools.partial( @@ -151,6 +162,7 @@ def sample_inputs_while_loop(opinfo, device, dtype, requires_grad, **kwargs): make_arg(2, 3, 4, low=0.1, high=2), ) + def simple_while_loop(iter_t, x): def cond_fn(iter_t, x): return iter_t > 0 @@ -161,7 +173,41 @@ def body_fn(iter_t, x): return torch._higher_order_ops.while_loop(cond_fn, body_fn, (iter_t, x)) +def sample_inputs_scan(opinfo, device, dtype, requires_grad, **kwargs): + make_arg = functools.partial( + make_tensor, device=device, dtype=dtype, requires_grad=requires_grad + ) + yield SampleInput( + make_arg(2, 2, low=0.1, high=2), + make_arg(2, 2, 2, low=0.1, high=2), + ) + + +def simple_scan(init, xs): + + def combine_fn(carry, x): + result = carry @ x + x + return result, carry.clone() + + return torch._higher_order_ops.scan(combine_fn, init, xs) + + hop_db = [ + OpInfo( + name="scan", + variant_test_name="simple", + op=simple_scan, + sample_inputs_func=sample_inputs_scan, + dtypes=all_types_and(torch.bool, torch.half), + supports_out=False, + check_batched_grad=False, + check_batched_gradgrad=False, + check_batched_forward_grad=False, + check_inplace_batched_forward_grad=False, + supports_autograd=False, + # "torch.compile with aot_autograd does not currently support double backward." 
+ supports_gradgrad=False, + ), OpInfo( name="invoke_subgraph", variant_test_name="simple", @@ -267,7 +313,9 @@ def body_fn(iter_t, x): check_inplace_batched_forward_grad=False, skips=( DecorateInfo(unittest.expectedFailure, "TestHOP", "test_aot_export"), - DecorateInfo(unittest.expectedFailure, "TestHOP", "test_pre_dispatch_export"), + DecorateInfo( + unittest.expectedFailure, "TestHOP", "test_pre_dispatch_export" + ), DecorateInfo(unittest.expectedFailure, "TestHOP", "test_serialize_export"), DecorateInfo(unittest.expectedFailure, "TestHOP", "test_retrace_export"), ), @@ -286,10 +334,12 @@ def body_fn(iter_t, x): check_inplace_batched_forward_grad=False, skips=( DecorateInfo(unittest.expectedFailure, "TestHOP", "test_aot_export"), - DecorateInfo(unittest.expectedFailure, "TestHOP", "test_pre_dispatch_export"), + DecorateInfo( + unittest.expectedFailure, "TestHOP", "test_pre_dispatch_export" + ), DecorateInfo(unittest.expectedFailure, "TestHOP", "test_serialize_export"), DecorateInfo(unittest.expectedFailure, "TestHOP", "test_retrace_export"), ), decorators=[onlyCUDA], - ) + ), ] From 846b4e614b16990c985187b7c743b6f5834932a9 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Fri, 25 Oct 2024 19:03:39 +0000 Subject: [PATCH 084/161] [TF32][cuDNN][Convolution] Add some missing TF32 decorators (#138768) Newer cuDNN versions seem to be able to dispatch to cuDNN kernels Pull Request resolved: https://github.com/pytorch/pytorch/pull/138768 Approved by: https://github.com/Skylion007 --- test/nn/test_convolution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py index aa9a69bbfd228..27c57f302d193 100644 --- a/test/nn/test_convolution.py +++ b/test/nn/test_convolution.py @@ -724,6 +724,7 @@ def test_ConvTranspose2d_half_cublas_gemm(self): # For https://github.com/pytorch/pytorch/pull/1273 # Almost identical to the above `test_Conv2d_naive_groups` @torch.backends.cudnn.flags(enabled=True, benchmark=False) + @tf32_on_and_off(0.001) @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7") def test_Conv2d_groups_nobias(self): dev_dtypes = [("cpu", torch.float)] @@ -769,6 +770,7 @@ def test_Conv2d_groups_nobias(self): # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686 # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024 @torch.backends.cudnn.flags(enabled=True, benchmark=False) + @tf32_on_and_off(0.001) @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7") def test_Conv2d_groups_nobias_v2(self): torch.manual_seed(123) @@ -3396,6 +3398,7 @@ def test_ConvTranspose3d_size_1_kernel(self, device): ) @dtypes(torch.float) @torch.backends.cudnn.flags(enabled=True, benchmark=False) + @tf32_on_and_off(0.001) @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7") def test_Conv2d_naive_groups(self, device, dtype): # Check that grouped convolutions matches two half convolutions From b988388bacdf8902b502fe18b7401133e8d6f4ba Mon Sep 17 00:00:00 2001 From: Ting Lu Date: Fri, 25 Oct 2024 19:10:07 +0000 Subject: [PATCH 085/161] Add CUDA 12.6 to Linux CD docker images (#138563) Reference https://github.com/pytorch/builder/pull/1003/files Related to #138440 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138563 Approved by: https://github.com/malfet --- .ci/docker/libtorch/Dockerfile | 5 +++++ .github/workflows/build-libtorch-images.yml | 2 +- .github/workflows/build-manywheel-images.yml | 2 +- 3 files changed, 7 
insertions(+), 2 deletions(-) diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index 2c73f55aff319..187e47724aa87 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -66,6 +66,11 @@ RUN bash ./install_cuda.sh 12.4 RUN bash ./install_magma.sh 12.4 RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda +FROM cuda as cuda12.6 +RUN bash ./install_cuda.sh 12.6 +RUN bash ./install_magma.sh 12.6 +RUN ln -sf /usr/local/cuda-12.6 /usr/local/cuda + FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 5e24788f86acf..abacbda450559 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -44,7 +44,7 @@ jobs: runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" strategy: matrix: - cuda_version: ["12.4", "12.1", "11.8"] + cuda_version: ["12.6", "12.4", "12.1", "11.8"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 681bd82d44065..4c77c669994ea 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -48,7 +48,7 @@ jobs: runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" strategy: matrix: - cuda_version: ["12.4", "12.1", "11.8"] + cuda_version: ["12.6", "12.4", "12.1", "11.8"] env: GPU_ARCH_TYPE: cuda GPU_ARCH_VERSION: ${{ matrix.cuda_version }} From 3a0c361899b1a49636902a2a4acc8b5eb8fe450d Mon Sep 17 00:00:00 2001 From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" Date: Fri, 25 Oct 2024 19:13:52 +0000 Subject: [PATCH 086/161] Remove presere ops (#138371) Summary: CI #buildall Test Plan: CI Reviewed By: StellarrZ Differential Revision: D64151426 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138371 Approved by: https://github.com/bdhirsh --- .ci/docker/ci_commit_pins/executorch.txt | 2 +- test/export/test_export.py | 148 +++++------------- .../test_export_training_ir_to_run_decomp.py | 9 -- torch/_decomp/__init__.py | 4 - torch/export/exported_program.py | 23 --- 5 files changed, 40 insertions(+), 146 deletions(-) diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 7199bb8d866d4..9aaea8851d475 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -ca4783992ed7602a39528ba304d61f00396b2a5a +export-D64151426 diff --git a/test/export/test_export.py b/test/export/test_export.py index 8b5e67ef8fe83..d00ceac949cf8 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -1280,12 +1280,7 @@ def forward(self, x): return torch.ops.mylib.foo123(lin) x = torch.randn(4, 4) - if IS_FBCODE: - ep = export(Bar(), (x,)).run_decompositions( - decomp_table=None, _preserve_ops=(torch.ops.aten.linear.default,) - ) - else: - ep = export(Bar(), (x,)).run_decompositions(table) + ep = export(Bar(), (x,)).run_decompositions(table) self.assertExpectedInline( str(ep.graph_module.code).strip(), @@ -1307,14 +1302,9 @@ def forward(self, x): return torch.ops.aten.chunk.default(x, 3, 0) ep = torch.export.export(Foo(), (torch.randn(3, 3),)) - if IS_FBCODE: - ep = ep.run_decompositions( - {}, _preserve_ops=(torch.ops.aten.linear.default,) - ) - else: - decomp_table = _decomp_table_to_post_autograd_aten() - del 
decomp_table[torch.ops.aten.linear.default] - ep = ep.run_decompositions(decomp_table) + decomp_table = _decomp_table_to_post_autograd_aten() + del decomp_table[torch.ops.aten.linear.default] + ep = ep.run_decompositions(decomp_table) gm = ep.graph_module # linear is CompositeImplicitAutograd functional op so we should preserve it @@ -1770,13 +1760,7 @@ def forward(self, x, y): ep = torch.export.export( Foo(), (torch.randn(20, 16, 50, 100), torch.randn(20, 16, 50)) ) - if IS_FBCODE: - ep_has_linear_convd = ep.run_decompositions( - {}, - _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY, - ) - else: - ep_has_linear_convd = ep.run_decompositions({}) + ep_has_linear_convd = ep.run_decompositions({}) self.assertExpectedInline( str(ep_has_linear_convd.graph_module.code).strip(), @@ -1791,19 +1775,11 @@ def forward(self, p_conv_weight, p_conv_bias, p_conv1d_weight, p_conv1d_bias, c_ return (add,)""", ) - if IS_FBCODE: - ep_has_convd = ep.run_decompositions( - _preserve_ops=( - torch.ops.aten.conv2d.default, - torch.ops.aten.conv1d.default, - ) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.conv2d.default] - del decomp_table[torch.ops.aten.conv1d.default] + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.conv2d.default] + del decomp_table[torch.ops.aten.conv1d.default] - ep_has_convd = ep.run_decompositions(decomp_table=decomp_table) + ep_has_convd = ep.run_decompositions(decomp_table=decomp_table) self.assertExpectedInline( str(ep_has_convd.graph_module.code).strip(), """\ @@ -1819,15 +1795,10 @@ def forward(self, p_conv_weight, p_conv_bias, p_conv1d_weight, p_conv1d_bias, c_ add = torch.ops.aten.add.Tensor(cos, sum_1); cos = sum_1 = None return (add,)""", ) - if IS_FBCODE: - ep_has_convd = ep_has_convd.run_decompositions( - _preserve_ops=(torch.ops.aten.conv2d.default,) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.conv2d.default] + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.conv2d.default] - ep_has_convd = ep_has_convd.run_decompositions(decomp_table=decomp_table) + ep_has_convd = ep_has_convd.run_decompositions(decomp_table=decomp_table) self.assertExpectedInline( str(ep_has_convd.graph_module.code).strip(), """\ @@ -1871,15 +1842,9 @@ def forward(self, x, y): Foo(), (torch.randn(20, 16, 50, 100), torch.randn(20, 16, 50)) ) - if IS_FBCODE: - ep_has_linear_convd = ep.run_decompositions( - {}, - _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY, - ) - else: - ep_has_linear_convd = ep.run_decompositions( - decomp_table={}, - ) + ep_has_linear_convd = ep.run_decompositions( + decomp_table={}, + ) self.assertExpectedInline( str(ep_has_linear_convd.graph_module.code).strip(), @@ -1894,19 +1859,11 @@ def forward(self, p_conv_weight, p_conv_bias, p_conv1d_weight, p_conv1d_bias, b_ return (add,)""", ) - if IS_FBCODE: - ep_has_convd = ep.run_decompositions( - _preserve_ops=( - torch.ops.aten.conv2d.default, - torch.ops.aten.conv1d.default, - ) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.conv2d.default] - del decomp_table[torch.ops.aten.conv1d.default] + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.conv2d.default] + del decomp_table[torch.ops.aten.conv1d.default] - ep_has_convd = ep.run_decompositions(decomp_table=decomp_table) + ep_has_convd = ep.run_decompositions(decomp_table=decomp_table) self.assertExpectedInline( 
str(ep_has_convd.graph_module.code).strip(), @@ -1924,14 +1881,9 @@ def forward(self, p_conv_weight, p_conv_bias, p_conv1d_weight, p_conv1d_bias, b_ return (add,)""", ) - if IS_FBCODE: - ep_has_convd = ep_has_convd.run_decompositions( - _preserve_ops=(torch.ops.aten.conv2d.default,) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.conv2d.default] - ep_has_convd = ep_has_convd.run_decompositions(decomp_table=decomp_table) + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.conv2d.default] + ep_has_convd = ep_has_convd.run_decompositions(decomp_table=decomp_table) self.assertExpectedInline( str(ep_has_convd.graph_module.code).strip(), @@ -2033,14 +1985,9 @@ def forward(self, x): return x.sin() + x.sum() ep = export(Foo(), (torch.ones(3, 3),)) - if IS_FBCODE: - ep_preserve_sum = ep.run_decompositions( - _preserve_ops=(torch.ops.aten.sum.default,) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.sum.default] - ep_preserve_sum = ep.run_decompositions(decomp_table) + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.sum.default] + ep_preserve_sum = ep.run_decompositions(decomp_table) # Even though we are decomposing to core aten which should make # sum into sum.dim_IntList, we explicitly marked it to not do that. @@ -8199,14 +8146,10 @@ def forward(self, x): return torch.ops.testlib.foo_mutated.default(y) decomp_table = torch.export.default_decompositions() + del decomp_table[torch.ops.testlib.foo_functional.default] - # FIXME (We need to design a proper way that doesn't need _preserve_ops) ep = torch.export.export(M(), (torch.randn(4, 4),)).run_decompositions( decomp_table, - _preserve_ops=( - torch.ops.testlib.foo_functional.default, - torch.ops.testlib.foo_mutated.default, - ), ) self.assertExpectedInline( @@ -8241,14 +8184,9 @@ def forward(self, x): }, ) - if IS_FBCODE: - ep = ep.run_decompositions( - {}, _preserve_ops=(torch.ops.aten.linear.default,) - ) - else: - table = torch.export.default_decompositions() - del table[torch.ops.aten.linear.default] - ep = ep.run_decompositions(table) + table = torch.export.default_decompositions() + del table[torch.ops.aten.linear.default] + ep = ep.run_decompositions(table) comp_mod = ep.module() inp1 = torch.randn(3, 4) @@ -9438,15 +9376,12 @@ def forward(self, x): ep.graph_module.code ) - if IS_FBCODE: - ep = ep.run_decompositions(_preserve_ops=(torch.ops.aten.elu.default,)) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.elu.default] + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.elu.default] - ep = ep.run_decompositions( - decomp_table=decomp_table, - ) + ep = ep.run_decompositions( + decomp_table=decomp_table, + ) FileCheck().check_count("torch.ops.aten.elu.default", 1, exactly=True).run( ep.graph_module.code ) @@ -9468,16 +9403,11 @@ def forward(self, x): "torch.ops.aten.upsample_bilinear2d.vec", 1, exactly=True ).run(ep.graph_module.code) - if IS_FBCODE: - ep = ep.run_decompositions( - _preserve_ops=(torch.ops.aten.upsample_bilinear2d.vec,) - ) - else: - decomp_table = default_decompositions() - del decomp_table[torch.ops.aten.upsample_bilinear2d.vec] - ep = ep.run_decompositions( - decomp_table=decomp_table, - ) + decomp_table = default_decompositions() + del decomp_table[torch.ops.aten.upsample_bilinear2d.vec] + ep = ep.run_decompositions( + decomp_table=decomp_table, + ) FileCheck().check_count( "torch.ops.aten.upsample_bilinear2d.vec", 1, exactly=True 
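For illustration, a minimal sketch of the decomposition-table pattern the tests above switch to; the toy module and the choice of `torch.ops.aten.linear.default` are assumptions for the example, not taken from the patch:

```python
import torch

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.linear(x, torch.eye(4))

ep = torch.export.export(M(), (torch.randn(2, 4),))

# Instead of the removed `_preserve_ops=(...)` argument, preserve an op by
# deleting it from the default decomposition table before decomposing.
table = torch.export.default_decompositions()
del table[torch.ops.aten.linear.default]
ep = ep.run_decompositions(table)

# aten.linear.default is expected to survive in the decomposed graph.
print(ep.graph_module.code)
```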
diff --git a/test/export/test_export_training_ir_to_run_decomp.py b/test/export/test_export_training_ir_to_run_decomp.py index b1168f54bb227..335f4ec7a0c19 100644 --- a/test/export/test_export_training_ir_to_run_decomp.py +++ b/test/export/test_export_training_ir_to_run_decomp.py @@ -1,6 +1,5 @@ # Owner(s): ["oncall: export"] import torch -from torch.testing._internal.common_utils import IS_FBCODE try: @@ -16,10 +15,6 @@ def mocked_training_ir_to_run_decomp_export_strict(*args, **kwargs): ep = torch.export.export_for_training(*args, **kwargs) - if IS_FBCODE: - return ep.run_decompositions( - {}, _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY - ) return ep.run_decompositions({}) @@ -29,10 +24,6 @@ def mocked_training_ir_to_run_decomp_export_non_strict(*args, **kwargs): else: ep = torch.export.export_for_training(*args, **kwargs, strict=False) - if IS_FBCODE: - return ep.run_decompositions( - {}, _preserve_ops=testing._COMPOSITE_OPS_THAT_CAN_BE_PRESERVED_TESTING_ONLY - ) return ep.run_decompositions({}) diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py index 9788e8fc68ad4..0541e2366e898 100644 --- a/torch/_decomp/__init__.py +++ b/torch/_decomp/__init__.py @@ -273,13 +273,9 @@ def core_aten_decompositions() -> Dict[torch._ops.OperatorBase, Callable]: _collect_all_valid_cia_ops_for_aten_namespace, _get_decomp_for_cia, ) - from torch._inductor import config # Entry without functional CIA ops decomp_table = _core_aten_decompositions_post_autograd() - if config.is_fbcode(): - return decomp_table - for op in _collect_all_valid_cia_ops_for_aten_namespace(): decomp_table[op] = _get_decomp_for_cia(op) return decomp_table diff --git a/torch/export/exported_program.py b/torch/export/exported_program.py index 6bddbf65163ce..c9214494ab50d 100644 --- a/torch/export/exported_program.py +++ b/torch/export/exported_program.py @@ -47,7 +47,6 @@ _collect_all_valid_cia_ops, _collect_and_set_constant_attrs, _collect_param_buffer_metadata, - _decomp_table_to_post_autograd_aten, _detect_fake_mode_from_gm, _get_decomp_for_cia, _is_preservable_cia_op, @@ -1054,7 +1053,6 @@ def _num_lifted_params_buffers(self): def run_decompositions( self, decomp_table: Optional[Dict[torch._ops.OperatorBase, Callable]] = None, - _preserve_ops: Tuple[torch._ops.OpOverload, ...] = (), ) -> "ExportedProgram": """ Run a set of decompositions on the exported program and returns a new @@ -1089,31 +1087,10 @@ def run_decompositions( decomp_table[your_op] = your_custom_decomp ep = ep.run_decompositions(decomp_table=decomp_table) """ - from torch._inductor import config - - # FIXME delete this option after PTC, Executorch syncing is - # bit annoying so can't get rid of it easily - if _preserve_ops != (): - warnings.warn( - "This API is deprecated and soon will be removed. " - "Please look at the docstring to see how to preserve " - "an operator." 
- ) - _decomp_table = ( default_decompositions() if decomp_table is None else dict(decomp_table) ) - if config.is_fbcode(): - # This means the decomp_table would only be containing post-autograd ops - # We should manually add CIA decomps - for k, v in _decomp_table_to_post_autograd_aten().items(): - _decomp_table[k] = v - - for op in _preserve_ops: - if op in _decomp_table: - del _decomp_table[op] - if isinstance(_decomp_table, CustomDecompTable): _decomp_table = _decomp_table.materialize() From a874ec85e83cfe75e7238296022d53d7e20860df Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Fri, 25 Oct 2024 19:25:18 +0000 Subject: [PATCH 087/161] [Functorch] Fix devices Parameter Type in benchmark_utilization Function (#138774) Summary: Issue described in https://github.com/pytorch/pytorch/issues/136697 Original user does not have CLA privileges so this is my commandeer Test Plan: OSS CI Differential Revision: D64872833 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138774 Approved by: https://github.com/davidberard98 --- torch/_functorch/benchmark_utils.py | 2 +- torch/_prims_common/wrappers.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/torch/_functorch/benchmark_utils.py b/torch/_functorch/benchmark_utils.py index e0bcae4c836e9..ac69e8bd4744c 100644 --- a/torch/_functorch/benchmark_utils.py +++ b/torch/_functorch/benchmark_utils.py @@ -222,7 +222,7 @@ def f(a): optimize_ctx, [ProfilerActivity.CUDA], num_runs=num_runs, - devices="cuda", + devices=["cuda"], ) utilization, mm_conv_utilization = compute_utilization( chrome_trace_file_name, total_length diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py index 85ba19a44da94..865925e7dadd6 100644 --- a/torch/_prims_common/wrappers.py +++ b/torch/_prims_common/wrappers.py @@ -68,7 +68,9 @@ def _maybe_convert_to_dtype(a, dtype): if a is None: return None - raise ValueError(f"Received type {type(a)} that is neither a tensor or a number!") + raise ValueError( + f"Received unsupported type {type(a)}. Expected TensorLike, Number, or Sequence." + ) def _maybe_convert_to_type(a: NumberType, typ: type) -> NumberType: From 78377ec1307912e611033ae93a9c84d244fe068b Mon Sep 17 00:00:00 2001 From: Menglu Yu Date: Fri, 25 Oct 2024 21:05:39 +0000 Subject: [PATCH 088/161] [PT2][Optimus] Normalize Clamp to use kwargs (#138723) Summary: The current clamp normalization does not include torch.clamp where its min and max are not normalized to kwargs, thus the batch fusion of clamp can hit min and max are both empty problem. 
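For illustration, a minimal sketch of the two call forms involved; the tensor and bounds are arbitrary:

```python
import torch

x = torch.randn(8)

# Positional bounds: previously left untouched by the normalization pass, so
# downstream batch fusion saw no "min"/"max" kwargs for this call.
y_positional = torch.clamp(x, 0.0, 1.0)

# Kwarg form the pass now normalizes to, letting later passes read the bounds:
y_kwargs = torch.clamp(x, min=0.0, max=1.0)

assert torch.equal(y_positional, y_kwargs)
```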
Test Plan: ``` buck2 run mode/opt servicelab/ai_ml/auto_tune:local_model_pt2 -- --flow_id 654509735 --test_mode split ``` GPU type: NVIDIA PG509-210 =============Print full analysis for offsite_cvr_oba_optout_dedicated_model================ | Metric | Value | |:-------------------|:-----------------| | GPU type | A100 | | Batch size | 10 | | Latency | 227.13 ms | | Model size | 2322763344 bytes | | Flops/example | 1136.52 G | | TFLOPS | 50.04 | | MFU | 16.04% | | Activation/example | 2722.49 MB | I1023 112249.043 local_model_with_pt2.py:25] benchmark results [('batch_size', 10), ('latency_ms', 22712), ('model_size_bytes', 2322763344), ('flops_per_example', 113652), ('tflops_g', 5003), ('mfu', 1603), ('activation_per_example_mb', 272249) Differential Revision: D64848369 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138723 Approved by: https://github.com/jackiexu1992 --- torch/_inductor/fx_passes/split_cat.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/fx_passes/split_cat.py b/torch/_inductor/fx_passes/split_cat.py index 33d38f367d803..46f990f7d9af0 100644 --- a/torch/_inductor/fx_passes/split_cat.py +++ b/torch/_inductor/fx_passes/split_cat.py @@ -469,6 +469,10 @@ def normalize_reshape_default(match: Match, *args, **kwargs): CallMethodVarArgs("clamp", users=MULTIPLE), pass_dict=construct_pattern_matcher_pass("normalization_pass"), ) +@register_graph_pattern( + CallFunctionVarArgs(torch.clamp, users=MULTIPLE), + pass_dict=construct_pattern_matcher_pass("normalization_pass"), +) def normalize_clamp_default(match: Match, *args, **kwargs): clamp_node = match.nodes[0] if not is_node_meta_valid(clamp_node): @@ -478,12 +482,20 @@ def normalize_clamp_default(match: Match, *args, **kwargs): if free_symbols(clamp_node.meta["example_value"].shape): log.debug("dynamic shape not supported: %s", clamp_node) return - + if len(clamp_node.args) > 1: + args = (get_arg_value(clamp_node, 0),) + kwargs = { + "min": get_arg_value(clamp_node, 1, kwarg_name="min"), + "max": get_arg_value(clamp_node, 2, kwarg_name="max"), + } + else: + args = clamp_node.args + kwargs = clamp_node.kwargs with match.graph.inserting_after(clamp_node): new_clamp_node = match.graph.call_function( torch.clamp, - args=clamp_node.args, - kwargs=clamp_node.kwargs, + args=args, + kwargs=kwargs, ) clamp_node.replace_all_uses_with(new_clamp_node) new_clamp_node.meta.update(clamp_node.meta) From 86b45bde196b976247f2771196f8c1c689dd66b7 Mon Sep 17 00:00:00 2001 From: Sam Larsen Date: Fri, 25 Oct 2024 21:30:18 +0000 Subject: [PATCH 089/161] [pt2] Add logger logging for remote fx graph cache get + put (#138164) Summary: Capture the timing for the remote fx graph cache get and put operations and add them to the logger logging. Test Plan: 1) Landed D64483593 and waited for logger actualization. 
2) Ran test script on devserver: `buck2 run mode/opt scripts/slarsen/torch_compile_model:run` 3) Queried dynamo_compile/sandbox: ``` (pytorch-3.10_4) devvm2296:~/local/pytorch-3.10_4 $ scuba -e="select time,co_filename,remote_fx_graph_cache_get_time_s,remote_fx_graph_cache_put_time_s from \`dynamo_compile/sandbox\` where remote_fx_graph_cache_put_time_s is not null" +------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+----------------------------------+ | time | co_filename | remote_fx_graph_cache_get_time_s | remote_fx_graph_cache_put_time_s | +------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+----------------------------------+ | 1729136266 | null | 0.05652284622192383 | 0.9691152572631836 | | 1729136263 | /data/users/slarsen/fbsource/buck-out/v2/gen/fbcode/289bb46b326874c6/scripts/slarsen/torch_compile_model/__run__/run-inplace#link-tree/scripts/slarsen/torch_compile_model/run.py | 0.8298435211181641 | 0.18642282485961914 | +------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+----------------------------------+ ``` Reviewed By: oulgen Differential Revision: D64484025 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138164 Approved by: https://github.com/jamesjwu, https://github.com/ezyang --- test/dynamo/test_structured_trace.py | 2 +- torch/_dynamo/convert_frame.py | 11 +++++++++++ torch/_dynamo/utils.py | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/test/dynamo/test_structured_trace.py b/test/dynamo/test_structured_trace.py index c0999b155ba0a..4e5c04d399fbf 100644 --- a/test/dynamo/test_structured_trace.py +++ b/test/dynamo/test_structured_trace.py @@ -368,7 +368,7 @@ def test_example_training_fn(self): {"inductor_post_grad_graph": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} {"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"} -{"bwd_compilation_metrics": {"compile_id": "2/0", "inductor_compile_time_s": , "code_gen_time_s": , "fail_type": null, "fail_reason": null, "remote_cache_time_saved_s": null, "structured_logging_overhead_s": , "is_forward": false}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} +{"bwd_compilation_metrics": {"compile_id": "2/0", "inductor_compile_time_s": , "code_gen_time_s": , "fail_type": null, "fail_reason": null, "remote_cache_time_saved_s": null, "structured_logging_overhead_s": , "is_forward": false, "remote_fx_graph_cache_get_time_ms": null, "remote_fx_graph_cache_put_time_ms": null}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1} {"dynamo_start": {"stack": "STACK"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} {"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} {"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": 
"device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 3, "frame_compile_id": 0, "attempt": 0} diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 42ddc36ad642e..856e399a6d8af 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -119,6 +119,7 @@ record_compilation_metrics, reset_graph_break_dup_checker, setup_compile_debug, + to_int_ms, troubleshooting_url, write_record_to_file, ) @@ -1067,6 +1068,12 @@ def format_guard_failures() -> str: "auto_functionalize", {"missed_reinplacing_bytes": possibly_missed_reinplacing_bytes}, ) + remote_fx_graph_cache_get_time = frame_phase_timing[frame_key].get( + "remote_fx_graph_cache_get", None + ) + remote_fx_graph_cache_put_time = frame_phase_timing[frame_key].get( + "remote_fx_graph_cache_put", None + ) else: guard_count = None shape_env_guard_count = None @@ -1084,6 +1091,8 @@ def format_guard_failures() -> str: dynamo_time_before_restart = time.time() - start_time possibly_missed_reinplacing_opportunities = None remote_cache_time_saved = None + remote_fx_graph_cache_get_time = None + remote_fx_graph_cache_put_time = None structured_logging_overhead_s = ( torch._logging.get_structured_logging_overhead() @@ -1136,6 +1145,8 @@ def handle_sets(d: Dict[str, Any]) -> Dict[str, Any]: config.specialize_float, json.dumps(config_dict), True, # is_forward + to_int_ms(remote_fx_graph_cache_get_time), + to_int_ms(remote_fx_graph_cache_put_time), ) record_compilation_metrics(metrics) torch._dynamo.callback_handler.run_end_callbacks() diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 445f4a1fb0ac6..696cf371b4233 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -343,10 +343,18 @@ def dynamo_timed( remote_cache_time_saved = frame_phase_timing[ compile_id ].get("remote_cache_time_saved", None) + remote_fx_graph_cache_get_time = frame_phase_timing[ + compile_id + ].get("remote_fx_graph_cache_get", None) + remote_fx_graph_cache_put_time = frame_phase_timing[ + compile_id + ].get("remote_fx_graph_cache_put", None) else: inductor_compile_time = None code_gen_time = None remote_cache_time_saved = None + remote_fx_graph_cache_get_time = None + remote_fx_graph_cache_put_time = None structured_logging_overhead_s = ( torch._logging.get_structured_logging_overhead() ) @@ -359,6 +367,8 @@ def dynamo_timed( remote_cache_time_saved, structured_logging_overhead_s, False, # is_forward + to_int_ms(remote_fx_graph_cache_get_time), + to_int_ms(remote_fx_graph_cache_put_time), ) record_compilation_metrics(metrics) @@ -765,6 +775,10 @@ def proxy_args_kwargs(args, kwargs): ) +def to_int_ms(v: Optional[float]) -> Optional[int]: + return None if v is None else int(v * 1000) + + @dataclasses.dataclass class CompilationMetrics: compile_id: str @@ -804,6 +818,8 @@ class CompilationMetrics: specialize_float: Optional[bool] dynamo_config: Optional[str] is_forward: Optional[bool] + remote_fx_graph_cache_get_time_ms: Optional[int] + remote_fx_graph_cache_put_time_ms: Optional[int] @dataclasses.dataclass @@ -816,6 +832,8 @@ class BwdCompilationMetrics: remote_cache_time_saved_s: Optional[float] structured_logging_overhead_s: Optional[float] is_forward: Optional[bool] + remote_fx_graph_cache_get_time_ms: Optional[int] + remote_fx_graph_cache_put_time_ms: Optional[int] DEFAULT_COMPILATION_METRICS_LIMIT = 64 From 14b8028c814bd2bd41e55a6593da86d6fd561617 Mon Sep 17 00:00:00 2001 From: Jiawen Liu 
Date: Fri, 25 Oct 2024 21:56:47 +0000 Subject: [PATCH 090/161] [Pytorch][ATEN] Enable FP8 NCCL in Pytorch ATEN (#138776) Summary: Enable FP8 NCCL in Pytorch ATEN to unblock FP8 collective communication such as FP8 all-to-all Test Plan: CI & D64374424 Differential Revision: D64866426 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138776 Approved by: https://github.com/eqy, https://github.com/jianyuh --- test/distributed/test_nccl.py | 9 ++++++++- torch/csrc/cuda/nccl.cpp | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/test/distributed/test_nccl.py b/test/distributed/test_nccl.py index ebf03e7ae1ddd..f9bb4f6543ee5 100644 --- a/test/distributed/test_nccl.py +++ b/test/distributed/test_nccl.py @@ -45,6 +45,13 @@ ) or TEST_WITH_ROCM: datatypes.append(torch.bfloat16) +# Broadcast (and alltoall) support float8, while reduce and allreduce do not support float8 currently +broadcast_dtypes = ( + datatypes + [torch.float8_e4m3fnuz, torch.float8_e5m2fnuz] + if TEST_WITH_ROCM + else [torch.float8_e4m3fn, torch.float8_e5m2] +) + class TestNCCL(TestCase): @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows") @@ -58,7 +65,7 @@ def test_unique_id(self, device): ) @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "only one GPU detected") - @dtypes(*datatypes) + @dtypes(*broadcast_dtypes) def test_broadcast(self, device, dtype): expected = torch.zeros(128).uniform_().to(dtype=dtype) tensors = [expected.cuda()] diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index e02b6a1840f13..a426d9043fa66 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -113,6 +113,18 @@ ncclDataType_t to_nccl_data_type(c10::ScalarType type) { return ncclDataType_t::ncclUint8; case at::kBool: return ncclDataType_t::ncclUint8; +#if defined(USE_ROCM) + case at::kFloat8_e4m3fnuz: + return ncclDataType_t::ncclUint8; + case at::kFloat8_e5m2fnuz: + return ncclDataType_t::ncclUint8; +#else + case at::kFloat8_e4m3fn: + return ncclDataType_t::ncclUint8; + case at::kFloat8_e5m2: + return ncclDataType_t::ncclUint8; +#endif + #if HAS_NCCL_BF16_DATATYPE case at::kBFloat16: return ncclDataType_t::ncclBfloat16; From 36b7135c6ff7ed1be4203968888ba4dd7ddbb3c6 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Fri, 25 Oct 2024 22:07:30 +0000 Subject: [PATCH 091/161] Revert "[fx graph cache] FxGraphPickler: Remove hack to stabilize device string hashes (#138681)" This reverts commit 6cadf616aeb612f3c866b734268919ad1616ffaf. Reverted https://github.com/pytorch/pytorch/pull/138681 on behalf of https://github.com/jeanschmidt due to Introduced regressions on linux-focal-cuda11.8-py3.10-gcc9 ([comment](https://github.com/pytorch/pytorch/pull/138681#issuecomment-2438945493)) --- test/inductor/test_codecache.py | 16 --------- .../_aot_autograd/autograd_cache.py | 10 ++++-- torch/_inductor/codecache.py | 35 ++++++++++++++----- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index b7b7f11ccea9b..70d1ae48f7cb0 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -835,22 +835,6 @@ def uuid(self) -> Optional[Union[bytes, str]]: FxGraphCachePickler.dumps(details3), ) - def test_stable_strings(self): - """ - Test that objects containing identical strings pickle the same - even if they are not the same id. 
- """ - s1 = "string" - s2 = "strin" - s2 += "g" - - self.assertNotEqual(id(s1), id(s2)) - - self.assertEqual( - FxGraphCachePickler.dumps([s1, s1]), - FxGraphCachePickler.dumps([s1, s2]), - ) - def test_get_hash_for_files(self): """ Test the get_hash_for_files helper. diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index aaf05634b343b..9512e6561a438 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -250,8 +250,14 @@ def _reduce_tensor(tensor): """ Reduce the tensor to a stable key for caching. """ - metadata = extract_tensor_metadata_for_cache_key(tensor) - return (_ident, (metadata,)) + return ( + _ident, + ( + extract_tensor_metadata_for_cache_key( + FxGraphCachePickler._device_map, tensor + ), + ), + ) class AOTAutogradCachePickler(FxGraphCachePickler): diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 618f7a5d10840..c914c6a7338bd 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -506,7 +506,9 @@ def _ident(x: T) -> T: return x -def extract_tensor_metadata_for_cache_key(t: Tensor) -> TensorMetadata: +def extract_tensor_metadata_for_cache_key( + device_map: Dict[torch.device, torch.device], t: Tensor +) -> TensorMetadata: """ Extracts the tensor metadata and removes fields of the TensorMetadata that are not needed for caching @@ -515,19 +517,32 @@ def extract_tensor_metadata_for_cache_key(t: Tensor) -> TensorMetadata: if not hasattr(t, "_is_inductor_static"): meta = dataclasses.replace(meta, storage_offset=0, storage_bytes=None) + # The pickle implementation avoids serializing the same object more than once. + # That behavior means the byte stream we create to hash will vary if, for example, + # we see two tensor objects with the same device, but the torch.device object is + # actually the same object vs. merely equivalent. We want to produce the same hash + # value in either situation, so we memoize the device objects and always reference + # the same object for a given device. It's possible other metadata fields deserve + # the same treatment, but so far we've only observed this issue with the device. + if meta.device not in device_map: + device_map[meta.device] = meta.device + meta = dataclasses.replace(meta, device=device_map[meta.device]) + return meta -def _reduce_fake_tensor(t: Tensor) -> Tuple[Callable[[T], T], Tuple[TensorMetadata]]: +def _reduce_fake_tensor( + device_map: Dict[torch.device, torch.device], t: Tensor +) -> Tuple[Callable[[T], T], Tuple[TensorMetadata]]: """ See FxGraphCachePickler. Custom reducer to pickle FakeTensors. """ - metadata = extract_tensor_metadata_for_cache_key(t) + metadata = extract_tensor_metadata_for_cache_key(device_map, t) return (_ident, (metadata,)) def _reduce_tensor( - t: Tensor, + device_map: Dict[torch.device, torch.device], t: Tensor ) -> Tuple[Callable[[T], T], Tuple[TensorMetadataAndValues]]: """ See FxGraphCachePickler. Custom reducer to pickle Tensors. @@ -555,7 +570,7 @@ def _reduce_tensor( f"FX graph cache handling of a large constant took {elapsed:.1}s. Please file an issue." ) - metadata = extract_tensor_metadata_for_cache_key(t) + metadata = extract_tensor_metadata_for_cache_key(device_map, t) return (_ident, (TensorMetadataAndValues(metadata, values),)) @@ -585,9 +600,13 @@ class FxGraphCachePickler(pickle.Pickler): data that allow us to compute a stable, but safe hash. """ + # See extract_tensor_metadata_for_cache_key. 
Whenever we extract metadata during + # pickling, we make sure devices always reference the same torch.device object. + _device_map: Dict[torch.device, torch.device] = {} + dispatch_table = copyreg.dispatch_table.copy() - dispatch_table[FakeTensor] = _reduce_fake_tensor - dispatch_table[torch.Tensor] = _reduce_tensor + dispatch_table[FakeTensor] = functools.partial(_reduce_fake_tensor, _device_map) + dispatch_table[torch.Tensor] = functools.partial(_reduce_tensor, _device_map) dispatch_table[torch.SymInt] = _reduce_symint dispatch_table[ torch.fx.experimental._backward_state.BackwardState @@ -629,7 +648,7 @@ def debug_lines(cls, inp: FxGraphHashDetails) -> List[str]: def get_str(obj: Any) -> str: if isinstance(obj, torch.Tensor): - return str(extract_tensor_metadata_for_cache_key(obj)) + return str(extract_tensor_metadata_for_cache_key(cls._device_map, obj)) elif isinstance(obj, bytes): return "" elif type(obj) in cls.dispatch_table: From 4d92d6e60436b1aeffbf4dfce51f16923505251b Mon Sep 17 00:00:00 2001 From: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Fri, 25 Oct 2024 22:11:44 +0000 Subject: [PATCH 092/161] [Inductor][ROCm][CK] Enable lowering conv2d instances in CK Inductor backend (#138643) Set PYTORCH_MIOPEN_SUGGEST_NHWC environment variable to force output layout to channels-last. This way, the channels-last CK instances will be added to benchmark choices in max autotune # Testing ``` pytest test/inductor/test_ck_backend.py -k conv2d ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138643 Approved by: https://github.com/chenyang78 --- test/inductor/test_ck_backend.py | 38 ++++++++++++++++++++++++++++++++ torch/_inductor/kernel/conv.py | 14 +++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/test/inductor/test_ck_backend.py b/test/inductor/test_ck_backend.py index bf386df514f80..3d51f621466d4 100644 --- a/test/inductor/test_ck_backend.py +++ b/test/inductor/test_ck_backend.py @@ -364,6 +364,44 @@ def linear(x_fp8, x_inverse_scale, w_t_fp8, w_inverse_scale, bias): torch.testing.assert_close(y_eager, y_compiled, rtol=1e-2, atol=0.05) + @unittest.skipIf(not torch.version.hip, "ROCM only") + @unittest.mock.patch.dict( + os.environ, + {"PATH": _get_path_without_sccache(), "PYTORCH_MIOPEN_SUGGEST_NHWC": "1"}, + ) + @parametrize("max_autotune_conv_backends", ("CK", "ATEN,CK,TRITON")) + def test_max_autotune_conv2d(self, max_autotune_conv_backends): + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False + + tensor_options = {"device": "cuda", "dtype": torch.float32} + + x = torch.randn(1, 8, 224, 224, **tensor_options) + w = torch.randn(64, 8, 7, 7, **tensor_options) + x_cl = x.to(memory_format=torch.channels_last) + w_cl = w.to(memory_format=torch.channels_last) + + assert "rocm" in dir(config) + + with config.patch( + { + "max_autotune": True, + "autotune_in_subproc": False, + "max_autotune_conv_backends": max_autotune_conv_backends, + "compile_threads": 4, + "rocm.ck_dir": self.ck_dir, + "rocm.n_max_profiling_configs": 4, + } + ): + + @torch.compile(dynamic=False) + def conv2d(x, w): + return torch.conv2d(x, w) + + Y_eager = torch.conv2d(x_cl, w_cl) + Y_compiled = conv2d(x_cl, w_cl) + + torch.testing.assert_close(Y_compiled, Y_eager, atol=2e-4, rtol=2e-4) + if __name__ == "__main__": from torch._inductor.utils import is_big_gpu diff --git a/torch/_inductor/kernel/conv.py b/torch/_inductor/kernel/conv.py index 71e3a21b005ef..94e9b86ea9405 100644 --- a/torch/_inductor/kernel/conv.py +++ 
b/torch/_inductor/kernel/conv.py @@ -6,6 +6,7 @@ from typing import cast, List, Optional, Sequence, Tuple, TYPE_CHECKING, TypedDict import torch +from torch._inductor.codegen.rocm.ck_conv_template import CKGroupedConvFwdTemplate from .. import config, ir from ..lowering import ( @@ -25,6 +26,7 @@ is_zeros, pad_listlike, sympy_product, + use_ck_conv_template, use_triton_template, ) from ..virtualized import V @@ -659,7 +661,17 @@ def channels_last_conv(): num_warps=cfg.num_warps, **cfg.kwargs, ) - + if use_ck_conv_template(layout): + CKGroupedConvFwdTemplate.add_ck_conv_choices( + choices, + layout, + input_nodes=(x, weight) + ((bias,) if bias is not None else tuple()), + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + n_spatial_dimensions=ndim, + ) return autotune_select_algorithm("convolution", choices, args, layout) From a57e418c1f7f0c0fab1775b9f188d86d4901b715 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Thu, 24 Oct 2024 17:55:28 -0700 Subject: [PATCH 093/161] [PGNCCL] Use ncclSend and ncclRecv (#138875) Stop routing to `torch::cuda::nccl`. Use native `ncclSend` and `ncclRecv` APIs instead. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138875 Approved by: https://github.com/shuqiangzhang --- .../distributed/c10d/ProcessGroupNCCL.cpp | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 8b29261d733e3..9610184efc98f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -4579,8 +4579,14 @@ c10::intrusive_ptr ProcessGroupNCCL::send( ncclComm_t comm, at::cuda::CUDAStream& stream, int dst) { - torch::cuda::nccl::send(input, comm, stream, dst); - return ncclSuccess; + auto ncclDataType = getNcclDataType(input.scalar_type()); + return ncclSend( + input.data_ptr(), + input.numel(), + ncclDataType, + dst, + comm, + stream.stream()); }, dstRank, OpType::SEND, @@ -4622,8 +4628,14 @@ c10::intrusive_ptr ProcessGroupNCCL::recv( ncclComm_t comm, at::cuda::CUDAStream& stream, int src) { - torch::cuda::nccl::recv(output, comm, stream, src); - return ncclSuccess; + auto ncclDataType = getNcclDataType(output.scalar_type()); + return ncclRecv( + output.data_ptr(), + output.numel(), + ncclDataType, + src, + comm, + stream.stream()); }, srcRank, OpType::RECV, From 54d13a93484f7779d1042ff4a6a3edad1579bc35 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 25 Oct 2024 13:42:37 -0700 Subject: [PATCH 094/161] [c10d][CI] Improve world size setting in some tests (#138846) Following change in #137161 , bumping world size for some test suites. 
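Conceptually, the affected suites now scale their world size with the visible devices instead of hard-coding it; a rough sketch of the idea (the helper name is hypothetical):

```python
import torch

def default_world_size() -> int:
    # Use every visible GPU for the NCCL suites; fall back to a small fixed
    # size for the Gloo/CPU path.
    return torch.cuda.device_count() if torch.cuda.is_available() else 2
```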
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138846 Approved by: https://github.com/fduwjj --- .../test_c10d_object_collectives.py | 7 ++++--- test/distributed/test_c10d_ops_nccl.py | 20 +++++++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/test/distributed/test_c10d_object_collectives.py b/test/distributed/test_c10d_object_collectives.py index ece50ebe8890b..dcd6de797e725 100644 --- a/test/distributed/test_c10d_object_collectives.py +++ b/test/distributed/test_c10d_object_collectives.py @@ -24,7 +24,6 @@ sys.exit(0) BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO -WORLD_SIZE = min(4, max(2, torch.cuda.device_count())) def with_comms(func=None): @@ -54,14 +53,16 @@ def setUp(self): @property def device(self): return ( - torch.device(self.rank) + torch.device("cuda", self.rank % torch.cuda.device_count()) if BACKEND == dist.Backend.NCCL else torch.device("cpu") ) @property def world_size(self): - return WORLD_SIZE + if BACKEND == dist.Backend.NCCL: + return torch.cuda.device_count() + return super().world_size @property def process_group(self): diff --git a/test/distributed/test_c10d_ops_nccl.py b/test/distributed/test_c10d_ops_nccl.py index c9fb0f30b53f9..f0249877c63bb 100644 --- a/test/distributed/test_c10d_ops_nccl.py +++ b/test/distributed/test_c10d_ops_nccl.py @@ -28,6 +28,7 @@ init_multigpu_helper, MultiProcContinousTest, requires_nccl, + TEST_SKIPS, ) from torch.testing._internal.common_utils import ( skip_but_pass_in_sandcastle_if, @@ -278,16 +279,21 @@ def test_allreduce_in_cudagraph(self): # single warmup pg.allreduce(xs).wait() - self.assertEqual(xs[0].item(), 2) + # 1 + 1 + ... = world_size + expected_val = self.world_size + self.assertEqual(xs[0].item(), expected_val) graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): pg.allreduce(xs).wait() - self.assertEqual(xs[0].item(), 2) + # Graph capture should not change the tensor value + self.assertEqual(xs[0].item(), expected_val) graph.replay() + expected_val *= self.world_size graph.replay() - self.assertEqual(xs[0].item(), 8) + expected_val *= self.world_size + self.assertEqual(xs[0].item(), expected_val) @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @@ -979,8 +985,14 @@ def allgather_base(output_t, input_t): if __name__ == "__main__": + if not torch.cuda.is_available(): + sys.exit(TEST_SKIPS["no_cuda"].exit_code) + rank = int(os.getenv("RANK", -1)) - world_size = int(os.getenv("WORLD_SIZE", 2)) + world_size = int(os.getenv("WORLD_SIZE", -1)) + + if world_size == -1: # Not set by external launcher + world_size = torch.cuda.device_count() if rank != -1: # Launched with torchrun or other multi-proc launchers. Directly run the test. From 939fc4e335bcfc8dd61ff4f08adf688ebaef8130 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Thu, 24 Oct 2024 17:25:40 -0700 Subject: [PATCH 095/161] [PGNCCL] Fix P2P data corruption in non-blocking mode (#138860) In non-blocking mode, it seems a single `ncclRecv` or `ncclSend` call can "early return" `ncclSuccess` before the kernel is fully enqueued. This causes the event record below missing the P2P the kernel, leading to data corruption. Side note: per NCCL, it is legal to call `ncclSend` or `ncclRecv` only if there is only one P2P op. This is true whether we are in blocking or non-blocking mode. In this fix, we use ncclGroup semantics to ensure that the kernel is enqueued for single-P2P ops. The ncclGroup call itself should introduce minimal overhead. 
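A rough sketch of the kind of two-rank program this fixes, assuming a 2-GPU launch via `torchrun` with NCCL non-blocking mode enabled (it mirrors the shape of the new test rather than copying it):

```python
import os
import torch
import torch.distributed as dist

os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"
dist.init_process_group("nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)

t = torch.ones(10, 10, device="cuda")
if rank == 0:
    dist.send(t, dst=1)  # a single ncclSend, now enqueued inside a NCCL group
elif rank == 1:
    out = torch.empty(10, 10, device="cuda")
    dist.recv(out, src=0)  # a single ncclRecv
    assert torch.equal(out, t)

dist.destroy_process_group()
```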
Added a test `test_non_blocking_p2p`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138860 Approved by: https://github.com/shuqiangzhang --- test/distributed/test_c10d_nccl.py | 18 ++++++++++++++++++ .../csrc/distributed/c10d/ProcessGroupNCCL.cpp | 12 ++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 027faceb43dd2..6d81901a7a66c 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -966,6 +966,24 @@ def test_non_blocking_with_eager_init(self): self.assertEqual(backend.comm_split_count(), 1) dist.destroy_process_group() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + def test_non_blocking_p2p(self): + # Test creating a pg using nonblocking mode but not eagerly + os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1" + os.environ["TORCH_NCCL_NONBLOCKING_TIMEOUT"] = "100" + store = c10d.FileStore(self.file_name, self.world_size) + device = self.rank_to_GPU[self.rank][0] + self._create_process_group_nccl(store, self.opts()) + # Generate the same tensor + send_tensor = torch.ones(10, 10, device=device) + if self.rank == 0: + dist.send(send_tensor, 1) + if self.rank == 1: + recv_tensor = torch.rand(10, 10, device=device) + dist.recv(recv_tensor, 0) + self.assertEqual(send_tensor, recv_tensor) + dist.destroy_process_group() + @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") def test_get_uid(self): diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 9610184efc98f..6206b4d6c5994 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -3380,10 +3380,14 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( fn(tensor, comm_, ncclStream, p2pTargetRank), ncclComm->getNcclCommFailureReason()); #else - C10D_NCCL_CHECK_TIMEOUT( - fn(tensor, comm_, ncclStream, p2pTargetRank), - ncclComm->getNcclComm(), - ncclComm->getNcclCommFailureReason()); + // In non-blocking mode, we need to use ncclGroup semantics to ensure that the + // kernel is enqueued for single-P2P ops. Otherwise, the event record below + // may not capture the kernel, leading to data corruption. 
+ ncclGroupStart(); + C10D_NCCL_CHECK_NONBLOCKING( + fn(tensor, comm_, ncclStream, p2pTargetRank), std::nullopt); + C10D_NCCL_CHECK_TIMEOUT_GROUPEND( + ncclGroupEnd(), ncclComm, ncclComm->getNcclCommFailureReason()); #endif if (!coalescing_state_) { From 1605d4aeb80c15c48f74ca8a82485addf26c9e53 Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 26 Oct 2024 00:13:19 +0000 Subject: [PATCH 096/161] Fix object slice (#138880) To avoid casting Tensor to Tensorbase Pull Request resolved: https://github.com/pytorch/pytorch/pull/138880 Approved by: https://github.com/Skylion007 --- functorch/csrc/dim/dim.cpp | 2 +- functorch/csrc/dim/python_variable_simple.h | 2 +- torch/csrc/autograd/python_variable.cpp | 31 ++++++++++----------- torch/csrc/autograd/python_variable.h | 2 +- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp index 47fe87c235261..304839cbaeedb 100644 --- a/functorch/csrc/dim/dim.cpp +++ b/functorch/csrc/dim/dim.cpp @@ -867,7 +867,7 @@ mpy::object Tensor::from_positional(Arena & A, at::Tensor tensor, Slice self = Tensor::create(); diff --git a/functorch/csrc/dim/python_variable_simple.h b/functorch/csrc/dim/python_variable_simple.h index caae566107600..fbd5cfd828157 100644 --- a/functorch/csrc/dim/python_variable_simple.h +++ b/functorch/csrc/dim/python_variable_simple.h @@ -26,7 +26,7 @@ struct THPVariable { TORCH_PYTHON_API extern PyObject *THPVariableClass; TORCH_PYTHON_API extern PyObject *ParameterClass; -TORCH_PYTHON_API PyObject * THPVariable_Wrap(at::TensorBase var); +TORCH_PYTHON_API PyObject * THPVariable_Wrap(const at::TensorBase& var); inline bool THPVariable_Check(PyObject *obj) { diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index d9c4ca0dc065e..8f113a6a70286 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -207,7 +207,7 @@ PyObject* ParameterClass = nullptr; static PyObject* THPVariable_NewWithVar( PyTypeObject* type, - Variable _var, + const at::TensorBase& _var, c10::impl::PyInterpreterStatus status, bool allow_preexisting_pyobj = false); @@ -254,8 +254,7 @@ void activateGPUTrace() { c10::impl::GPUTrace::set_trace(getPyInterpreter()); } -// TODO: Make this take Variable by const reference -PyObject* THPVariable_Wrap(at::TensorBase var) { +PyObject* THPVariable_Wrap(const at::TensorBase& var) { if (!var.defined()) { Py_RETURN_NONE; } @@ -263,7 +262,7 @@ PyObject* THPVariable_Wrap(at::TensorBase var) { if (c10::impl::HermeticPyObjectTLS::get_state()) { return THPVariable_NewWithVar( (PyTypeObject*)THPVariableClass, - std::move(var), + var, c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED); } @@ -282,7 +281,7 @@ PyObject* THPVariable_Wrap(at::TensorBase var) { // object if all C++ references go to zero var.unsafeGetTensorImpl()->pyobj_slot()->set_owns_pyobj(false); reinterpret_cast(obj)->cdata = - MaybeOwned::owned(std::move(var)); + MaybeOwned::owned(Variable(var)); // NB: incref is not necessary, because we are "stealing" the previous // ownership from the Variable to return it here for the wrap return obj; @@ -308,16 +307,14 @@ PyObject* THPVariable_Wrap(at::TensorBase var) { } if (C10_LIKELY(var.device().type() != c10::kXLA)) { - return THPVariable_NewWithVar( - (PyTypeObject*)THPVariableClass, std::move(var), status); + return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var, status); } if (auto clazz = getPythonTensorClass(var.device())) { - return 
THPVariable_NewWithVar((PyTypeObject*)clazz, std::move(var), status); + return THPVariable_NewWithVar((PyTypeObject*)clazz, var, status); } - return THPVariable_NewWithVar( - (PyTypeObject*)THPVariableClass, std::move(var), status); + return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var, status); } bool isResurrectable(THPVariable* self) { @@ -619,7 +616,7 @@ static PyObject* view_func_impl( } } } - return THPVariable_Wrap(std::move(out)); + return THPVariable_Wrap(out); END_HANDLE_TH_ERRORS } @@ -655,7 +652,7 @@ static PyObject* rev_view_func_impl(PyObject* self_, PyObject* arg) { TORCH_CHECK(view_info.has_view_fn(), "No _rev_view_func() found"); out = view_info.rev_view_fn()(new_view); } - return THPVariable_Wrap(std::move(out)); + return THPVariable_Wrap(out); END_HANDLE_TH_ERRORS } @@ -1898,7 +1895,7 @@ PyObject* THPVariable_pynew( // these to be passed on directly. return THPVariable_NewWithVar( type, - std::move(tensor), + tensor, c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED, /*allow_preexisting_pyobj=*/true); END_HANDLE_TH_ERRORS @@ -2012,7 +2009,7 @@ void THPVariable_subclass_dealloc(PyObject* self) { // It's ALWAYS safe (albeit slower) to call this with MAYBE_UNINITIALIZED. static PyObject* THPVariable_NewWithVar( PyTypeObject* type, - Variable _var, + const at::TensorBase& _var, c10::impl::PyInterpreterStatus status, bool allow_preexisting_pyobj) { // Make sure that the reinterpret into a THPVariable* will be valid @@ -2082,7 +2079,7 @@ static PyObject* THPVariable_NewWithVar( " which is not a subclass of the " "requested type"); // We may (in fact, we typically will) need to resurrect this - return THPVariable_Wrap(std::move(_var)); + return THPVariable_Wrap(_var); } PyObject* obj = type->tp_alloc(type, 0); @@ -2092,7 +2089,7 @@ static PyObject* THPVariable_NewWithVar( new (&v->cdata) MaybeOwned(); if (c10::impl::HermeticPyObjectTLS::get_state()) { // Do NOT initialize pyobj field on the tensor, you own the C++ - v->cdata = MaybeOwned::owned(std::move(_var)); + v->cdata = MaybeOwned::owned(Variable(_var)); TORCH_INTERNAL_ASSERT( !check_has_torch_dispatch(obj), "While HermeticPyObject was enabled, we attempted to create a tensor " @@ -2104,7 +2101,7 @@ static PyObject* THPVariable_NewWithVar( "Python op registration."); } else { // Normal codepath - v->cdata = MaybeOwned::owned(std::move(_var)); + v->cdata = MaybeOwned::owned(Variable(_var)); const auto& var = THPVariable_Unpack(v); var.unsafeGetTensorImpl()->pyobj_slot()->init_pyobj( getPyInterpreter(), obj, status); diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h index 51ade77f03ece..32cc5c930ca0a 100644 --- a/torch/csrc/autograd/python_variable.h +++ b/torch/csrc/autograd/python_variable.h @@ -37,7 +37,7 @@ TORCH_PYTHON_API extern PyObject* THPVariableClass; TORCH_PYTHON_API extern PyObject* ParameterClass; bool THPVariable_initModule(PyObject* module); -TORCH_PYTHON_API PyObject* THPVariable_Wrap(at::TensorBase var); +TORCH_PYTHON_API PyObject* THPVariable_Wrap(const at::TensorBase& var); inline bool THPVariable_CheckTypeExact(PyTypeObject* tp) { // Check that a python object is a `Tensor`, but not a `Tensor` subclass. From 7ada81410756432a440f7406c11f25494bebb40a Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Sat, 26 Oct 2024 00:16:05 +0000 Subject: [PATCH 097/161] [c10/util] Add explicit include of to c10/util/env.cpp (#138854) Add explicit include of `` to `c10/util/env.cpp` since it has usages of `std::lock_guard` which is defined in the header ``. 
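For reference, `std::lock_guard` (and `std::mutex`) are declared in the standard header `<mutex>`, so a translation unit that uses them should include `<mutex>` directly rather than rely on a transitive include. A minimal sketch of the pattern (generic C++; `g_env_mutex` and `update_shared_state` are made-up names, not the actual contents of `env.cpp`):

```cpp
#include <mutex>  // declares std::mutex and std::lock_guard

static std::mutex g_env_mutex;  // hypothetical, for illustration only

void update_shared_state() {
  // Without the <mutex> include above, this line only compiles when some
  // other header happens to pull in <mutex> transitively.
  std::lock_guard<std::mutex> guard(g_env_mutex);
  // ... read/write shared state ...
}
```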
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138854 Approved by: https://github.com/cyyever, https://github.com/Skylion007 --- c10/util/env.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/c10/util/env.cpp b/c10/util/env.cpp index c3d7e38f6ea6f..dcc969ac381ba 100644 --- a/c10/util/env.cpp +++ b/c10/util/env.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace c10::utils { From a3de067975a61f11ccb8580416ea4d4a838cdb24 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 24 Oct 2024 09:54:30 -0700 Subject: [PATCH 098/161] [PyTorch] Use 128-bit vectors for ARM64 (#137426) The correct vector length for ARM64 is 128 bits (16 bytes). We were previously using double this, apparently just because that would be the same length as AVX2. Differential Revision: [D63984039](https://our.internmc.facebook.com/intern/diff/D63984039/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/137426 Approved by: https://github.com/jgong5, https://github.com/malfet ghstack dependencies: #138486, #138542, #138655, #138716, #138744 --- aten/src/ATen/CMakeLists.txt | 2 +- aten/src/ATen/cpu/vec/functional_base.h | 15 +- aten/src/ATen/cpu/vec/vec.h | 1 + aten/src/ATen/cpu/vec/vec128/vec128.h | 9 + .../ATen/cpu/vec/vec128/vec128_float_neon.h | 590 ++++++++++++ .../vec128_half_neon.h} | 314 ++---- aten/src/ATen/cpu/vec/vec256/vec256.h | 3 - .../src/ATen/cpu/vec/vec256/vec256_bfloat16.h | 22 +- aten/src/ATen/cpu/vec/vec256/vec256_convert.h | 50 +- aten/src/ATen/cpu/vec/vec256/vec256_float.h | 2 + .../ATen/cpu/vec/vec256/vec256_float_neon.h | 909 ------------------ aten/src/ATen/cpu/vec/vec256/vec256_qint.h | 171 ++-- aten/src/ATen/cpu/vec/vec512/vec512_float.h | 3 + aten/src/ATen/cpu/vec/vec_base.h | 49 +- aten/src/ATen/test/vec_test_all_types.cpp | 1 + setup.py | 1 + test/test_mps.py | 4 + torch/_inductor/codegen/cpp.py | 7 +- torch/_inductor/cpu_vec_isa.py | 4 +- 19 files changed, 844 insertions(+), 1313 deletions(-) create mode 100644 aten/src/ATen/cpu/vec/vec128/vec128.h create mode 100644 aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h rename aten/src/ATen/cpu/vec/{vec256/vec256_half_neon.h => vec128/vec128_half_neon.h} (61%) delete mode 100644 aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index bcfaff434c1bd..a0a845eed6562 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER) endif() EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) -file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/sve/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h") +file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec128/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/sve/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h") file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp") file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh" "cuda/tunable/*.cuh" "cuda/tunable/*.h") file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp" "cuda/tunable/*.cpp") diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index f599161e8daea..4d1d05ea8d326 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -85,24 +85,19 @@ struct VecReduceAllSIMD { using 
Vec = Vectorized; Vec v = acc_vec; - // 128-bit shuffle: [a1, a2, a3, a4, a5, a6, a7, a8] -> [a5, a6, a7, a8, a1, a2, a3, a4] - Vec v1 = {v.get_high(), v.get_low()}; - // [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] ('+' stands for the reduction function. Note that the last 4 elements are not required) - v = vec_fun(v, v1); - // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, a4+a8, a1+a5, a2+a6, -, -, -, -] - float32x4_t v1_1 = vextq_f32(v.get_low(), v.get_low(), 2); - v1 = {v1_1, v1_1}; + float32x4_t v1_1 = vextq_f32(v, v, 2); + Vec v1 = v1_1; // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] v = vec_fun(v, v1); // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, -] - v1_1 = vrev64q_f32(v.get_low()); - v1 = {v1_1, v1_1}; + v1_1 = vrev64q_f32(v); + v1 = v1_1; // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] v = vec_fun(v, v1); - return v.get_low()[0]; + return v[0]; } }; #endif // defined(__aarch64__) diff --git a/aten/src/ATen/cpu/vec/vec.h b/aten/src/ATen/cpu/vec/vec.h index 234431068a40b..e4b0c4b95d845 100644 --- a/aten/src/ATen/cpu/vec/vec.h +++ b/aten/src/ATen/cpu/vec/vec.h @@ -3,6 +3,7 @@ #if defined(CPU_CAPABILITY_AVX512) #include #else +#include #include #endif diff --git a/aten/src/ATen/cpu/vec/vec128/vec128.h b/aten/src/ATen/cpu/vec/vec128/vec128.h new file mode 100644 index 0000000000000..0d0108a1f6e1f --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec128/vec128.h @@ -0,0 +1,9 @@ +#pragma once +// ARM NEON uses 128-bit vector registers. + +#include + +#if !defined(CPU_CAPABILITY_SVE) +#include +#include +#endif diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h new file mode 100644 index 0000000000000..7476159221178 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h @@ -0,0 +1,590 @@ +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include + +#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#include +#endif + +// Sleef offers vectorized versions of some transcedentals +// such as sin, cos, tan etc.. +// However for now opting for STL, since we are not building +// with Sleef for mobile yet. + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Right now contains only aarch64 implementation. +// Due to follow two reasons aarch32 is not currently supported. +// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics +// that work for aarch64 dont work for aarch32. +// 2. Android NDK r21 has problems with compiling aarch32. +// Clang seg faults. +// https://github.com/android/ndk/issues/1248 +// https://bugs.llvm.org/show_bug.cgi?id=45824 +// Most likely we will do aarch32 support with inline asm. +#if defined(__aarch64__) + +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." 
+#endif + +#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) +#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code +#else +#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code +#endif + +template +struct BlendRegs { + static float32x4_t impl( + const float32x4_t& a, const float32x4_t& b, float32x4_t& res); +}; + +template +struct BlendRegs{ + static float32x4_t impl( + const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { + return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index); + } +}; + +template +struct BlendRegs{ + static float32x4_t impl( + const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { + return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index); + } +}; + +template <> class Vectorized { +private: + float32x4_t values; +public: + using value_type = float; + using size_type = int; + static constexpr size_type size() { + return 4; + } + Vectorized() {} + Vectorized(float32x4_t v) : values(v) {} + Vectorized(float val) : values{vdupq_n_f32(val)} {} + Vectorized(float val0, float val1, float val2, float val3) : + values{val0, val1, val2, val3} {} + Vectorized(float (&arr)[4]) : Vectorized(arr[0], arr[1], arr[2], arr[3]) {} + operator float32x4_t() const { + return values; + } + template + static Vectorized blend(const Vectorized& a, const Vectorized& b) { + Vectorized vec; + vec.values = + BlendRegs<0, (mask & 0x01)!=0>::impl( + a.values, b.values, vec.values); + vec.values = + BlendRegs<1, (mask & 0x02)!=0>::impl( + a.values, b.values, vec.values); + vec.values = + BlendRegs<2, (mask & 0x04)!=0>::impl( + a.values, b.values, vec.values); + vec.values = + BlendRegs<3, (mask & 0x08)!=0>::impl( + a.values, b.values, vec.values); + return vec; + } + static Vectorized blendv(const Vectorized& a, const Vectorized& b, + const Vectorized& mask) { + // TODO + // NB: This requires that each value, i.e., each uint value, + // of the mask either all be zeros or all be 1s. + // We perhaps need some kind of an assert? + // But that will affect performance. 
+ Vectorized vec(mask.values); + vec.values = vbslq_f32( + vreinterpretq_u32_f32(vec.values), + b.values, + a.values); + return vec; + } + template + static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const Vectorized step_sizes(0, 1, 2, 3); + return fmadd(step_sizes, step_vec, base_vec); + } + static Vectorized set(const Vectorized& a, const Vectorized& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = vbslq_f32( + vreinterpretq_u32_f32(vec.values), + b.values, + a.values); + return vec; + } + case 2: + { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = vbslq_f32( + vreinterpretq_u32_f32(vec.values), + b.values, + a.values); + return vec; + } + case 3: + { + Vectorized vec; + static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; + vec.values = vreinterpretq_f32_u32(mask_low); + vec.values = vbslq_f32( + vreinterpretq_u32_f32(vec.values), + b.values, + a.values); + return vec; + } + } + return b; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f32(reinterpret_cast(ptr)); + } else { + __at_align__ float tmp_values[size()]; + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0.0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(float)); + return vld1q_f32(reinterpret_cast(tmp_values)); + } + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f32(reinterpret_cast(ptr), values); + } else { + float tmp_values[size()]; + vst1q_f32(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float)); + } + } + // Very slow implementation of indexing. + // Only required because vec256_qint refers to this. + // Once we specialize that implementation for ARM + // this should be removed. TODO (kimishpatel) + float operator[](int idx) const { + __at_align__ float tmp[size()]; + store(tmp); + return tmp[idx]; + } + float operator[](int idx) { + __at_align__ float tmp[size()]; + store(tmp); + return tmp[idx]; + } + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. 
+ int zero_mask() const { + __at_align__ float tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++ i) { + if (tmp[i] == 0.f) { + mask |= (1 << i); + } + } + return mask; + } + Vectorized isnan() const { + __at_align__ float tmp[size()]; + __at_align__ float res[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i])) { + std::memset(static_cast(&res[i]), 0xFF, sizeof(float)); + } else { + std::memset(static_cast(&res[i]), 0, sizeof(float)); + } + } + return loadu(res); + }; + bool has_inf_nan() const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if(_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } + Vectorized map(float (*const f)(float)) const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized map2( + const Vectorized& second, + float (*const f)(float, float)) const { + __at_align__ float tmp[size()]; + __at_align__ float tmp_second[size()]; + store(tmp); + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i], tmp_second[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return Vectorized(vabsq_f32(values)); + } + Vectorized angle() const { + auto zero = Vectorized(0); + auto pi = Vectorized(c10::pi); + auto tmp = blendv(zero, pi, *this < zero); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0.f); + } + Vectorized conj() const { + return *this; + } +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ + Vectorized name() const { \ + return USE_SLEEF( \ + Vectorized(sleef_name(values)), \ + map(std::name) \ + ); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acos) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(acosh) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(asin) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atan) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(atanh) + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, sleef_name) \ + Vectorized name(const Vectorized &arg) const { \ + return USE_SLEEF( \ + Vectorized(sleef_name(values, arg.values)), \ + map2(arg, std::name) \ + ); \ + } + +#define DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(name) \ + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(name, Sleef_##name##f4_u10) + + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(atan2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(copysign, Sleef_copysignf4) + Vectorized erf() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(erfc, Sleef_erfcf4_u15) + Vectorized erfinv() const { + return map(calc_erfinv); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1) + Vectorized exp_u20() const { + return exp(); + } + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(fmod, Sleef_fmodf4); + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(hypot, Sleef_hypotf4_u05); + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + 
Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized &x) const { + return map2(x, calc_igamma); + } + Vectorized igammac(const Vectorized &x) const { + return map2(x, calc_igammac); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log10) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log1p) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(log2) + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(nextafter, Sleef_nextafterf4) + Vectorized frac() const; + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sin) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(sinh) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cos) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(cosh) + Vectorized ceil() const { + return map(at::native::ceil_impl); + } + Vectorized floor() const { + return map(at::native::floor_impl); + } + Vectorized neg() const { + return Vectorized( + vnegq_f32(values)); + } + Vectorized round() const { + // We do not use std::round because we would like to round midway numbers to the nearest even integer. + return map(at::native::round_impl); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tan) + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(tanh) + Vectorized trunc() const { + return Vectorized(vrndq_f32(values)); + } + DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(lgamma) + Vectorized sqrt() const { + return Vectorized(vsqrtq_f32(values)); + } + Vectorized reciprocal() const { + return Vectorized(vdivq_f32(vdupq_n_f32(1.0f), values)); + } + Vectorized rsqrt() const { + return this->sqrt().reciprocal(); + } + DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC(pow) + Vectorized operator==(const Vectorized& other) const { + return Vectorized(vreinterpretq_f32_u32(vceqq_f32(values, other.values))); + } + + Vectorized operator!=(const Vectorized& other) const { + float32x4_t r0 = vreinterpretq_f32_u32( + vmvnq_u32(vceqq_f32(values, other.values))); + return Vectorized(r0); + } + + Vectorized operator<(const Vectorized& other) const { + return Vectorized(vreinterpretq_f32_u32(vcltq_f32(values, other.values))); + } + + Vectorized operator<=(const Vectorized& other) const { + return Vectorized(vreinterpretq_f32_u32(vcleq_f32(values, other.values))); + } + + Vectorized operator>(const Vectorized& other) const { + return Vectorized(vreinterpretq_f32_u32(vcgtq_f32(values, other.values))); + } + + Vectorized operator>=(const Vectorized& other) const { + return Vectorized(vreinterpretq_f32_u32(vcgeq_f32(values, other.values))); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; + +template <> +Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { + return Vectorized(vaddq_f32(a, b)); +} + +template <> +Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { + return Vectorized(vsubq_f32(a, b)); +} + +template <> +Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { + return Vectorized(vmulq_f32(a, b)); +} + +template <> +Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { + return Vectorized(vdivq_f32(a, b)); +} + +// frac. 
Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +//Added sleef Implementation for Maximum +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { + if(!a.has_inf_nan() && !b.has_inf_nan()){ + return USE_SLEEF( + Vectorized(Sleef_fmaxf4(a, b)), + Vectorized(vmaxq_f32(a,b))); + } + else{ + return Vectorized(vmaxq_f32(a, b)); + } +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { + return Vectorized(vminq_f32(a, b)); +} + +template <> +Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32(vandq_u32( + vreinterpretq_u32_f32(a), + vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32(vorrq_u32( + vreinterpretq_u32_f32(a), + vreinterpretq_u32_f32(b)))); +} + +template <> +Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { + return Vectorized(vreinterpretq_f32_u32(veorq_u32( + vreinterpretq_u32_f32(a), + vreinterpretq_u32_f32(b)))); +} + +inline Vectorized Vectorized::eq(const Vectorized& other) const { + return (*this == other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ne(const Vectorized& other) const { + return (*this != other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::gt(const Vectorized& other) const { + return (*this > other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::ge(const Vectorized& other) const { + return (*this >= other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::lt(const Vectorized& other) const { + return (*this < other) & Vectorized(1.0f); +} + +inline Vectorized Vectorized::le(const Vectorized& other) const { + return (*this <= other) & Vectorized(1.0f); +} + +template <> +inline void convert(const float* src, int32_t* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + int64_t i; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { + vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); + } +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { + return Vectorized(vfmaq_f32(c, a, b)); +} + +template <> +Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { + return Vectorized(vfmsq_f32(c, a, b)); +} + +inline Vectorized Vectorized::erf() const{ + // constants + const Vectorized neg_zero_vec(-0.f); + const Vectorized one_vec(1.0f); 
+ const Vectorized p(0.3275911f); + const Vectorized p1(0.254829592f); + const Vectorized p2(-0.284496736f); + const Vectorized p3(1.421413741f); + const Vectorized p4(-1.453152027f); + const Vectorized p5(1.061405429f); + // sign(x) + auto sign_mask = neg_zero_vec & *this; + auto abs_vec = this->abs(); + // t = 1 / (p * abs(x) + 1) + auto tmp0 = fmadd(p, abs_vec, one_vec); + auto t = one_vec / tmp0; + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = fmadd(p5, t, p4); + auto tmp2 = fmadd(tmp1, t, p3); + auto tmp3 = fmadd(tmp2, t, p2); + auto r = fmadd(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = (*this) * (*this); + auto neg_pow_2 = pow_2 ^ neg_zero_vec; + auto tmp4 = neg_pow_2.map(std::exp); // This can be swapped for a faster implementation of exp. + auto tmp5 = tmp4 ^ neg_zero_vec; + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = t * tmp5; + auto tmp7 = fmadd(tmp6, r, one_vec); + return tmp7 ^ sign_mask; +} +#undef DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC +#undef DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC +#endif /* defined(aarch64) */ + +}} // namespace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h similarity index 61% rename from aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h rename to aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h index 0b51972a029b4..c3f45d930fa9a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -4,7 +4,7 @@ // See Note [Do not compile initializers with AVX] #include -#include +#include #include #include #include @@ -61,15 +61,15 @@ struct BlendHalfRegs { template <> class Vectorized { private: - float16x8x2_t values; + float16x8_t values; public: // value_type should be c10::Half to fit interface with vec_base.h using value_type = c10::Half; using size_type = int; static constexpr size_type size() { - static_assert(sizeof(float16x8x2_t) == 16 * sizeof(value_type)); - return 16; + static_assert(sizeof(float16x8_t) == 8 * sizeof(value_type)); + return 8; } private: @@ -89,69 +89,43 @@ class Vectorized { Vectorized map_with_vec_float_method( Vectorized (Vectorized::*m)() const) const { - // Convert low float16x8_t to 2 float32x4_t variables, apply m, and convert - // back - float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); - float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); - Vectorized mv0 = (Vectorized(v00, v01).*m)(); - float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); - float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); - - // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert - // back - float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); - float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); - Vectorized mv1 = (Vectorized(v10, v11).*m)(); - float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); - float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); - - // Pack result into Vectorized - return Vectorized( - vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values)); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values)); + Vectorized mv0 = (Vectorized(v00).*m)(); + Vectorized mv1 = (Vectorized(v01).*m)(); + float16x4_t r00 = vcvt_f16_f32(mv0); + float16x4_t r01 = vcvt_f16_f32(mv1); + return Vectorized(vcombine_f16(r00, r01)); } Vectorized map2_with_vec_float_method( const Vectorized& second, Vectorized (Vectorized::*m)(const Vectorized&) const) const { - // Convert 
low float16x8_t to 2 float32x4_t variables, apply m, and convert - // back - float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); - float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); - float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.get_low())); - float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.get_low())); - Vectorized mv0 = (Vectorized(v00, v01).*m)( - Vectorized(second_v00, second_v01)); - float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); - float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); - - // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert - // back - float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); - float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); - float32x4_t second_v10 = vcvt_f32_f16(vget_low_f16(second.get_high())); - float32x4_t second_v11 = vcvt_f32_f16(vget_high_f16(second.get_high())); - Vectorized mv1 = (Vectorized(v10, v11).*m)( - Vectorized(second_v10, second_v11)); - float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); - float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values)); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values)); + float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.values)); + float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.values)); + Vectorized mv0 = (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = (Vectorized(v01).*m)(Vectorized(second_v01)); + float16x4_t r00 = vcvt_f16_f32(mv0); + float16x4_t r01 = vcvt_f16_f32(mv1); // Pack result into Vectorized - return Vectorized( - vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + return Vectorized(vcombine_f16(r00, r01)); } public: // constructor Vectorized() {} - Vectorized(float16x8x2_t v) : values(v) {} + Vectorized(float16x8_t v) : values(v) {} // A ctor that accepts c10::Half is needed to fit interface with vec_base.h // A second constructor that takes float16_t is also included Vectorized(c10::Half val) - : values{vdupq_n_f16((float16_t)val), vdupq_n_f16((float16_t)val)} { + : values{vdupq_n_f16((float16_t)val)} { } - Vectorized(float16_t val) : values{vdupq_n_f16(val), vdupq_n_f16(val)} {} + Vectorized(float16_t val) : values{vdupq_n_f16(val)} {} Vectorized( float16_t val0, float16_t val1, @@ -160,15 +134,7 @@ class Vectorized { float16_t val4, float16_t val5, float16_t val6, - float16_t val7, - float16_t val8, - float16_t val9, - float16_t val10, - float16_t val11, - float16_t val12, - float16_t val13, - float16_t val14, - float16_t val15) + float16_t val7) : values{ val0, val1, @@ -177,17 +143,8 @@ class Vectorized { val4, val5, val6, - val7, - val8, - val9, - val10, - val11, - val12, - val13, - val14, - val15} {} - Vectorized(float16x8_t val0, float16x8_t val1) : values{val0, val1} {} - operator float16x8x2_t() const { + val7} {} + operator float16x8_t() const { return values; } template @@ -196,42 +153,23 @@ class Vectorized { const Vectorized& b) { Vectorized vec; // 0. 
- vec.values.val[0] = BlendHalfRegs<0, (mask & 0x01) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<1, (mask & 0x02) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<2, (mask & 0x04) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<3, (mask & 0x08) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - - vec.values.val[0] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - - // 1. - vec.values.val[1] = BlendHalfRegs<0, (mask & 0x10) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<1, (mask & 0x20) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<2, (mask & 0x40) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<3, (mask & 0x80) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - - vec.values.val[1] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values = BlendHalfRegs<0, (mask & 0x01) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<1, (mask & 0x02) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<2, (mask & 0x04) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<3, (mask & 0x08) != 0>::impl( + a.values, b.values, vec.values); + + vec.values = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( + a.values, b.values, vec.values); + vec.values = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( + a.values, b.values, vec.values); return vec; } @@ -249,14 +187,10 @@ class Vectorized { // We perhaps need some kind of an assert? // But that will affect performance. 
Vectorized vec(mask.values); - vec.values.val[0] = vbslq_f16( - vreinterpretq_u16_f16(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - vec.values.val[1] = vbslq_f16( - vreinterpretq_u16_f16(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); + vec.values = vbslq_f16( + vreinterpretq_u16_f16(vec.values), + b.values, + a.values); return vec; } template @@ -266,40 +200,32 @@ class Vectorized { const Vectorized base_vec(base); const Vectorized step_vec(step); const Vectorized step_sizes( - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + 0, 1, 2, 3, 4, 5, 6, 7); return fmadd(step_sizes, step_vec, base_vec); } static Vectorized set( const Vectorized& a, const Vectorized& b, int64_t count = size()) { - uint16_t pre_mask[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + uint16_t pre_mask[size()] = {0}; for (int i = 0; i < count; i++) { pre_mask[i] = 0xFFFF; } - uint16x8x2_t mask = vld1q_u16_x2(pre_mask); + uint16x8_t mask = vld1q_u16(pre_mask); // Using blendv is awkward because 0xFFFF is one of many NaN's in FP16 // so we directly use vbslq_f16 instead Vectorized vec( vbslq_f16( - // Low bits - mask.val[0], - b.values.val[0], - a.values.val[0]), - // High bits - vbslq_f16(mask.val[1], b.values.val[1], a.values.val[1])); + mask, + b.values, + a.values)); return vec; } static Vectorized loadu(const void* ptr, int64_t count = size()) { if (count == size()) { - return vld1q_f16_x2(reinterpret_cast(ptr)); - } else if (count == (size() >> 1)) { - Vectorized res; - res.values.val[0] = vld1q_f16(reinterpret_cast(ptr)); - std::memset(&res.values.val[1], 0, sizeof(res.values.val[1])); - return res; + return vld1q_f16(reinterpret_cast(ptr)); } __at_align__ float16_t tmp_values[size()]; for (const auto i : c10::irange(size())) { @@ -309,32 +235,18 @@ class Vectorized { tmp_values, reinterpret_cast(ptr), count * sizeof(float16_t)); - return vld1q_f16_x2(reinterpret_cast(tmp_values)); + return vld1q_f16(reinterpret_cast(tmp_values)); } void store(void* ptr, int64_t count = size()) const { if (count == size()) { - vst1q_f16_x2(reinterpret_cast(ptr), values); + vst1q_f16(reinterpret_cast(ptr), values); return; - } else if (count == (size() >> 1)) { - vst1q_f16(reinterpret_cast(ptr), values.val[0]); } else { float16_t tmp_values[size()]; - vst1q_f16_x2(reinterpret_cast(tmp_values), values); + vst1q_f16(reinterpret_cast(tmp_values), values); std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); } } - inline const float16x8_t& get_low() const { - return values.val[0]; - } - inline float16x8_t& get_low() { - return values.val[0]; - } - inline const float16x8_t& get_high() const { - return values.val[1]; - } - inline float16x8_t& get_high() { - return values.val[1]; - } // Very slow implementation of indexing. // Only required because vec256_qint refers to this. 
// Once we specialize that implementation for ARM @@ -394,8 +306,7 @@ class Vectorized { return loadu(tmp); } Vectorized abs() const { - return Vectorized( - vabsq_f16(values.val[0]), vabsq_f16(values.val[1])); + return Vectorized(vabsq_f16(values)); } Vectorized angle() const { auto zero = Vectorized(0); @@ -518,8 +429,7 @@ class Vectorized { return map(at::native::floor_impl); } Vectorized neg() const { - return Vectorized( - vnegq_f16(values.val[0]), vnegq_f16(values.val[1])); + return Vectorized(vnegq_f16(values)); } inline Vectorized round() const { // This function is questionable with a conversion, so we use map @@ -532,22 +442,17 @@ class Vectorized { return map_with_vec_float_method(&Vectorized::tanh); } Vectorized trunc() const { - float16x8_t r0 = vrndq_f16(values.val[0]); - float16x8_t r1 = vrndq_f16(values.val[1]); - return Vectorized(r0, r1); + return Vectorized(vrndq_f16(values)); } Vectorized lgamma() const { return map_with_vec_float_method(&Vectorized::lgamma); } Vectorized sqrt() const { - return Vectorized( - vsqrtq_f16(values.val[0]), vsqrtq_f16(values.val[1])); + return Vectorized(vsqrtq_f16(values)); } Vectorized reciprocal() const { auto ones = vdupq_n_f16(1.0f); - auto r0 = vdivq_f16(ones, values.val[0]); - auto r1 = vdivq_f16(ones, values.val[1]); - return Vectorized(r0, r1); + return Vectorized(vdivq_f16(ones, values)); } Vectorized rsqrt() const { return this->sqrt().reciprocal(); @@ -556,51 +461,28 @@ class Vectorized { return map2_with_vec_float_method(exp, &Vectorized::pow); } Vectorized operator==(const Vectorized& other) const { - float16x8_t r0 = - vreinterpretq_f16_u16(vceqq_f16(values.val[0], other.values.val[0])); - float16x8_t r1 = - vreinterpretq_f16_u16(vceqq_f16(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vceqq_f16(values, other.values))); } Vectorized operator!=(const Vectorized& other) const { - float16x8_t r0 = vreinterpretq_f16_u16( - vmvnq_u16(vceqq_f16(values.val[0], other.values.val[0]))); - float16x8_t r1 = vreinterpretq_f16_u16( - vmvnq_u16(vceqq_f16(values.val[1], other.values.val[1]))); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16( + vmvnq_u16(vceqq_f16(values, other.values)))); } Vectorized operator<(const Vectorized& other) const { - float16x8_t r0 = - vreinterpretq_f16_u16(vcltq_f16(values.val[0], other.values.val[0])); - float16x8_t r1 = - vreinterpretq_f16_u16(vcltq_f16(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vcltq_f16(values, other.values))); } Vectorized operator<=(const Vectorized& other) const { - float16x8_t r0 = - vreinterpretq_f16_u16(vcleq_f16(values.val[0], other.values.val[0])); - float16x8_t r1 = - vreinterpretq_f16_u16(vcleq_f16(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vcleq_f16(values, other.values))); } Vectorized operator>(const Vectorized& other) const { - float16x8_t r0 = - vreinterpretq_f16_u16(vcgtq_f16(values.val[0], other.values.val[0])); - float16x8_t r1 = - vreinterpretq_f16_u16(vcgtq_f16(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); } Vectorized operator>=(const Vectorized& other) const { - float16x8_t r0 = - vreinterpretq_f16_u16(vcgeq_f16(values.val[0], other.values.val[0])); - float16x8_t r1 = - vreinterpretq_f16_u16(vcgeq_f16(values.val[1], other.values.val[1])); - return Vectorized(r0, 
r1); + return Vectorized(vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); } Vectorized eq(const Vectorized& other) const; @@ -615,36 +497,28 @@ template <> Vectorized inline operator+( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vaddq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vaddq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vaddq_f16(a, b)); } template <> Vectorized inline operator-( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vsubq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vsubq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vsubq_f16(a, b)); } template <> Vectorized inline operator*( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vmulq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vmulq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vmulq_f16(a, b)); } template <> Vectorized inline operator/( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vdivq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vdivq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vdivq_f16(a, b)); } // frac. Implement this here so we can use subtraction @@ -658,9 +532,7 @@ template <> Vectorized inline maximum( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vmaxq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vmaxq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vmaxq_f16(a, b)); } // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if @@ -669,9 +541,7 @@ template <> Vectorized inline minimum( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vminq_f16(a.get_low(), b.get_low()); - float16x8_t r1 = vminq_f16(a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vminq_f16(a, b)); } template <> @@ -700,36 +570,24 @@ template <> Vectorized inline operator&( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vreinterpretq_f16_u16(vandq_u16( - vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); - float16x8_t r1 = vreinterpretq_f16_u16(vandq_u16( - vreinterpretq_u16_f16(a.get_high()), - vreinterpretq_u16_f16(b.get_high()))); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vandq_u16( + vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); } template <> Vectorized inline operator|( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vreinterpretq_f16_u16(vorrq_u16( - vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); - float16x8_t r1 = vreinterpretq_f16_u16(vorrq_u16( - vreinterpretq_u16_f16(a.get_high()), - vreinterpretq_u16_f16(b.get_high()))); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(vorrq_u16( + vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); } template <> Vectorized inline operator^( const Vectorized& a, const Vectorized& b) { - float16x8_t r0 = vreinterpretq_f16_u16(veorq_u16( - vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); - float16x8_t r1 = vreinterpretq_f16_u16(veorq_u16( - vreinterpretq_u16_f16(a.get_high()), - vreinterpretq_u16_f16(b.get_high()))); - return Vectorized(r0, r1); + return Vectorized(vreinterpretq_f16_u16(veorq_u16( + vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); } inline Vectorized Vectorized::eq( @@ -771,7 +629,6 @@ inline void convert(const float16_t* src, int16_t* dst, int64_t n) { for (i = 0; 
i <= (n - Vectorized::size()); i += Vectorized::size()) { vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); - vst1q_s16(dst + i + 8, vcvtq_s16_f16(vld1q_f16(src + i + 8))); } #ifndef __msvc_cl__ #pragma unroll @@ -790,7 +647,6 @@ inline void convert(const int16_t* src, float16_t* dst, int64_t n) { for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); - vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8))); } #ifndef __msvc_cl__ #pragma unroll @@ -805,9 +661,7 @@ Vectorized inline fmadd( const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float16x8_t r0 = vfmaq_f16(c.get_low(), a.get_low(), b.get_low()); - float16x8_t r1 = vfmaq_f16(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vfmaq_f16(c, a, b)); } template <> @@ -815,9 +669,7 @@ Vectorized inline fmsub( const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float16x8_t r0 = vfmsq_f16(c.get_low(), a.get_low(), b.get_low()); - float16x8_t r1 = vfmsq_f16(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); + return Vectorized(vfmsq_f16(c, a, b)); } #endif /* defined(aarch64) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(C10_MOBILE) */ diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 68367b81bd8a0..f88e852303912 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -9,9 +9,6 @@ #if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) #if defined(CPU_CAPABILITY_SVE256) #include -#else -#include -#include #endif #include #include diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index 12c11abb748de..ec84c7bfa5356 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -1101,35 +1101,27 @@ CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); inline std::tuple, Vectorized> convert_half_float(const Vectorized& a) { static_assert(Vectorized::size() == 2 * Vectorized::size()); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - float16x8x2_t arr = a; - float16x8_t x = arr.val[0]; - float16x8_t y = arr.val[1]; + float16x8_t x = a; #else auto arr = reinterpret_cast(a.operator const Half*()); float16x8_t x = vld1q_f16(arr); - float16x8_t y = vld1q_f16(arr + Vectorized::size()); #endif float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); - float32x4_t y1 = vcvt_f32_f16(vget_low_f16(y)); - float32x4_t y2 = vcvt_f32_f16(vget_high_f16(y)); - return { Vectorized(x1, x2), Vectorized(y1, y2) }; + return { Vectorized(x1), Vectorized(x2) }; } inline Vectorized convert_float_half(const Vectorized& a, const Vectorized& b) { static_assert(Vectorized::size() == 2 * Vectorized::size()); - float32x4x2_t x = a; - float32x4x2_t y = b; - float16x4_t x1 = vcvt_f16_f32(x.val[0]); - float16x4_t x2 = vcvt_f16_f32(x.val[1]); - float16x4_t y1 = vcvt_f16_f32(y.val[0]); - float16x4_t y2 = vcvt_f16_f32(y.val[1]); + float32x4_t x = a; + float32x4_t y = b; + float16x4_t x1 = vcvt_f16_f32(x); + float16x4_t x2 = vcvt_f16_f32(y); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - return Vectorized(vcombine_f16(x1, x2), vcombine_f16(y1, y2)); + return Vectorized(vcombine_f16(x1, x2)); #else Vectorized rc; auto arr = reinterpret_cast(rc.operator Half*()); vst1q_f16(arr, vcombine_f16(x1, x2)); - vst1q_f16(arr + 
Vectorized::size(), vcombine_f16(y1, y2)); return rc; #endif } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h index d5910f957b6d4..7ae2e8168c74d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h @@ -284,7 +284,7 @@ struct VecConvert< #endif /* defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) */ -#if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)) || (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +#if (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)) template struct VecConvert< float, @@ -298,19 +298,59 @@ struct VecConvert< } }; #endif +#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_half_register_to_float(src[0]); + } +}; +template +struct VecConvert< + float, + 2, + src_t, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + const auto [v0, v1] = convert_int8_to_float(src[0]); + return VectorizedN(v0, v1); + } +}; +#endif #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) template <> -struct VecConvert { - static inline VectorizedN apply( +struct VecConvert { + static inline VectorizedN apply( const VectorizedN& src) { - VectorizedN result; + VectorizedN result; uint16x8_t u16_8 = vld1q_u16(reinterpret_cast(&src[0])); auto u16_low1 = vget_low_u16(u16_8); auto u16_high1 = vget_high_u16(u16_8); float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16)); float32x4_t f32x4_1 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16)); - result[0] = {f32x4_0, f32x4_1}; + result[0] = f32x4_0; + result[1] = f32x4_1; + return result; + } +}; +// Half register to full register. +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + uint16x4_t u16_8 = vld1_u16(reinterpret_cast(&src[0])); + float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_8), 16)); + result[0] = f32x4_0; return result; } }; diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index 16e7a0d9318cc..687dc71ef8691 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -35,6 +35,8 @@ template <> class Vectorized { float val5, float val6, float val7, float val8) { values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8); } + Vectorized(const float (&arr)[8]) + : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]) {} operator __m256() const { return values; } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h deleted file mode 100644 index fdf9d66898646..0000000000000 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h +++ /dev/null @@ -1,909 +0,0 @@ -#pragma once - -// DO NOT DEFINE STATIC DATA IN THIS HEADER! -// See Note [Do not compile initializers with AVX] - -#include -#include -#include - -#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) -#include -#endif - -// Sleef offers vectorized versions of some transcedentals -// such as sin, cos, tan etc.. -// However for now opting for STL, since we are not building -// with Sleef for mobile yet. 
- -namespace at::vec { -// See Note [CPU_CAPABILITY namespace] -inline namespace CPU_CAPABILITY { - -// Right now contains only aarch64 implementation. -// Due to follow two reasons aarch32 is not currently supported. -// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics -// that work for aarch64 dont work for aarch32. -// 2. Android NDK r21 has problems with compiling aarch32. -// Clang seg faults. -// https://github.com/android/ndk/issues/1248 -// https://bugs.llvm.org/show_bug.cgi?id=45824 -// Most likely we will do aarch32 support with inline asm. -#if defined(__aarch64__) - -#ifdef __BIG_ENDIAN__ -#error "Big endian is not supported." -#endif - -#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) -#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code -#else -#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code -#endif - -template -struct BlendRegs { - static float32x4_t impl( - const float32x4_t& a, const float32x4_t& b, float32x4_t& res); -}; - -template -struct BlendRegs{ - static float32x4_t impl( - const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { - return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index); - } -}; - -template -struct BlendRegs{ - static float32x4_t impl( - const float32x4_t& a, const float32x4_t& b, float32x4_t& res) { - return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index); - } -}; - -template <> class Vectorized { -private: - float32x4x2_t values; -public: - using value_type = float; - using size_type = int; - static constexpr size_type size() { - return 8; - } - Vectorized() {} - Vectorized(float32x4x2_t v) : values(v) {} - Vectorized(float val) : values{vdupq_n_f32(val), vdupq_n_f32(val) } {} - Vectorized(float val0, float val1, float val2, float val3, - float val4, float val5, float val6, float val7) : - values{val0, val1, val2, val3, val4, val5, val6, val7} {} - Vectorized(float32x4_t val0, float32x4_t val1) : values{val0, val1} {} - operator float32x4x2_t() const { - return values; - } - template - static Vectorized blend(const Vectorized& a, const Vectorized& b) { - Vectorized vec; - // 0. - vec.values.val[0] = - BlendRegs<0, (mask & 0x01)!=0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = - BlendRegs<1, (mask & 0x02)!=0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = - BlendRegs<2, (mask & 0x04)!=0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - vec.values.val[0] = - BlendRegs<3, (mask & 0x08)!=0>::impl( - a.values.val[0], b.values.val[0], vec.values.val[0]); - // 1. - vec.values.val[1] = - BlendRegs<0, (mask & 0x10)!=0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = - BlendRegs<1, (mask & 0x20)!=0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = - BlendRegs<2, (mask & 0x40)!=0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - vec.values.val[1] = - BlendRegs<3, (mask & 0x80)!=0>::impl( - a.values.val[1], b.values.val[1], vec.values.val[1]); - return vec; - } - static Vectorized blendv(const Vectorized& a, const Vectorized& b, - const Vectorized& mask) { - // TODO - // NB: This requires that each value, i.e., each uint value, - // of the mask either all be zeros or all be 1s. - // We perhaps need some kind of an assert? - // But that will affect performance. 
- Vectorized vec(mask.values); - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - template - static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) { - const Vectorized base_vec(base); - const Vectorized step_vec(step); - const Vectorized step_sizes(0, 1, 2, 3, 4, 5, 6, 7); - return fmadd(step_sizes, step_vec, base_vec); - } - static Vectorized set(const Vectorized& a, const Vectorized& b, - int64_t count = size()) { - switch (count) { - case 0: - return a; - case 1: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 2: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 3: - { - Vectorized vec; - static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; - vec.values.val[0] = vreinterpretq_f32_u32(mask_low); - vec.values.val[1] = a.values.val[1]; - vec.values.val[0] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[0]), - b.values.val[0], - a.values.val[0]); - return vec; - } - case 4: - return Vectorized(b.values.val[0], a.values.val[1]); - case 5: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0x0, 0x0, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - case 6: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - case 7: - { - Vectorized vec; - static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0}; - vec.values.val[0] = b.values.val[0]; - vec.values.val[1] = vreinterpretq_f32_u32(mask_high); - vec.values.val[1] = vbslq_f32( - vreinterpretq_u32_f32(vec.values.val[1]), - b.values.val[1], - a.values.val[1]); - return vec; - } - } - return b; - } - static Vectorized loadu(const void* ptr, int64_t count = size()) { - if (count == size()) { - return vld1q_f32_x2(reinterpret_cast(ptr)); - } - else if (count == (size() >> 1)) { - Vectorized res; - res.values.val[0] = vld1q_f32(reinterpret_cast(ptr)); - res.values.val[1] = vdupq_n_f32(0.f); - return res; - } - else { - __at_align__ float tmp_values[size()]; - for (const auto i : c10::irange(size())) { - tmp_values[i] = 0.0; - } - std::memcpy( - tmp_values, - reinterpret_cast(ptr), - count * sizeof(float)); - return vld1q_f32_x2(reinterpret_cast(tmp_values)); - } - } - void store(void* ptr, int64_t count = size()) const { - if (count == size()) { - vst1q_f32_x2(reinterpret_cast(ptr), values); - } - else if (count == (size() >> 1)) { - vst1q_f32(reinterpret_cast(ptr), values.val[0]); - } - else { - float 
tmp_values[size()]; - vst1q_f32_x2(reinterpret_cast(tmp_values), values); - std::memcpy(ptr, tmp_values, count * sizeof(float)); - } - } - inline const float32x4_t& get_low() const { - return values.val[0]; - } - inline float32x4_t& get_low() { - return values.val[0]; - } - inline const float32x4_t& get_high() const { - return values.val[1]; - } - inline float32x4_t& get_high() { - return values.val[1]; - } - // Very slow implementation of indexing. - // Only required because vec256_qint refers to this. - // Once we specialize that implementation for ARM - // this should be removed. TODO (kimishpatel) - float operator[](int idx) const { - __at_align__ float tmp[size()]; - store(tmp); - return tmp[idx]; - } - float operator[](int idx) { - __at_align__ float tmp[size()]; - store(tmp); - return tmp[idx]; - } - // For boolean version where we want to if any 1/all zero - // etc. can be done faster in a different way. - int zero_mask() const { - __at_align__ float tmp[size()]; - store(tmp); - int mask = 0; - for (int i = 0; i < size(); ++ i) { - if (tmp[i] == 0.f) { - mask |= (1 << i); - } - } - return mask; - } - Vectorized isnan() const { - __at_align__ float tmp[size()]; - __at_align__ float res[size()]; - store(tmp); - for (const auto i : c10::irange(size())) { - if (_isnan(tmp[i])) { - std::memset(static_cast(&res[i]), 0xFF, sizeof(float)); - } else { - std::memset(static_cast(&res[i]), 0, sizeof(float)); - } - } - return loadu(res); - }; - bool has_inf_nan() const { - __at_align__ float tmp[size()]; - store(tmp); - for (const auto i : c10::irange(size())) { - if(_isnan(tmp[i]) || _isinf(tmp[i])) { - return true; - } - } - return false; - } - Vectorized map(float (*const f)(float)) const { - __at_align__ float tmp[size()]; - store(tmp); - for (const auto i : c10::irange(size())) { - tmp[i] = f(tmp[i]); - } - return loadu(tmp); - } - Vectorized abs() const { - return Vectorized(vabsq_f32(values.val[0]), vabsq_f32(values.val[1])); - } - Vectorized angle() const { - auto zero = Vectorized(0); - auto pi = Vectorized(c10::pi); - auto tmp = blendv(zero, pi, *this < zero); - return blendv(tmp, *this, isnan()); - } - Vectorized real() const { - return *this; - } - Vectorized imag() const { - return Vectorized(0.f); - } - Vectorized conj() const { - return *this; - } - Vectorized acos() const { - return USE_SLEEF( - Vectorized(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])), - map(std::acos) - ); - } - Vectorized acosh() const { - return USE_SLEEF( - Vectorized(Sleef_acoshf4_u10(values.val[0]), Sleef_acoshf4_u10(values.val[1])), - map(std::acosh) - ); - } - Vectorized asin() const { - return USE_SLEEF( - Vectorized(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])), - map(std::asin) - ); - } - Vectorized atan() const { - return USE_SLEEF( - Vectorized(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])), - map(std::atan) - ); - } - Vectorized atanh() const { - return USE_SLEEF( - Vectorized(Sleef_atanhf4_u10(values.val[0]), Sleef_atanhf4_u10(values.val[1])), - map(std::atanh) - ); - } - Vectorized atan2(const Vectorized &exp) const { - USE_SLEEF( - { - return Vectorized(Sleef_atan2f4_u10(values.val[0], exp.values.val[0]), - Sleef_atan2f4_u10(values.val[1], exp.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_exp[size()]; - store(tmp); - exp.store(tmp_exp); - for (const auto i : c10::irange(size())) { - tmp[i] = std::atan2(tmp[i], tmp_exp[i]); - } - return loadu(tmp); - } - ) - } - Vectorized copysign(const Vectorized 
&sign) const { - USE_SLEEF( - { - return Vectorized(Sleef_copysignf4(values.val[0], sign.values.val[0]), - Sleef_copysignf4(values.val[1], sign.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_sign[size()]; - store(tmp); - sign.store(tmp_sign); - for (size_type i = 0; i < size(); i++) { - tmp[i] = std::copysign(tmp[i], tmp_sign[i]); - } - return loadu(tmp); - } - ) - } - Vectorized erf() const; - Vectorized erfc() const { - return USE_SLEEF( - Vectorized(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])), - map(std::erfc) - ); - } - Vectorized erfinv() const { - return map(calc_erfinv); - } - Vectorized exp() const { - return USE_SLEEF( - Vectorized(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])), - map(std::exp) - ); - } - Vectorized exp2() const { - return USE_SLEEF( - Vectorized(Sleef_exp2f4_u10(values.val[0]), Sleef_exp2f4_u10(values.val[1])), - map(std::exp2) - ); - } - Vectorized expm1() const { - return USE_SLEEF( - Vectorized(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])), - map(std::expm1) - ); - } - Vectorized exp_u20() const { - return exp(); - } - Vectorized fmod(const Vectorized& q) const { - USE_SLEEF( - { - return Vectorized(Sleef_fmodf4(values.val[0], q.values.val[0]), - Sleef_fmodf4(values.val[1], q.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_q[size()]; - store(tmp); - q.store(tmp_q); - for (const auto i : c10::irange(size())) { - tmp[i] = std::fmod(tmp[i], tmp_q[i]); - } - return loadu(tmp); - } - ) - } - Vectorized hypot(const Vectorized &b) const { - USE_SLEEF( - { - return Vectorized(Sleef_hypotf4_u05(values.val[0], b.values.val[0]), - Sleef_hypotf4_u05(values.val[1], b.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_b[size()]; - store(tmp); - b.store(tmp_b); - for (const auto i : c10::irange(size())) { - tmp[i] = std::hypot(tmp[i], tmp_b[i]); - } - return loadu(tmp); - } - ) - } - Vectorized i0() const { - return map(calc_i0); - } - Vectorized i0e() const { - return map(calc_i0e); - } - Vectorized digamma() const { - return map(calc_digamma); - } - Vectorized igamma(const Vectorized &x) const { - __at_align__ float tmp[size()]; - __at_align__ float tmp_x[size()]; - store(tmp); - x.store(tmp_x); - for (const auto i : c10::irange(size())) { - tmp[i] = calc_igamma(tmp[i], tmp_x[i]); - } - return loadu(tmp); - } - Vectorized igammac(const Vectorized &x) const { - __at_align__ float tmp[size()]; - __at_align__ float tmp_x[size()]; - store(tmp); - x.store(tmp_x); - for (const auto i : c10::irange(size())) { - tmp[i] = calc_igammac(tmp[i], tmp_x[i]); - } - return loadu(tmp); - } - Vectorized log() const { - return USE_SLEEF( - Vectorized(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])), - map(std::log) - ); - } - Vectorized log10() const { - return USE_SLEEF( - Vectorized(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])), - map(std::log10) - ); - } - Vectorized log1p() const { - return USE_SLEEF( - Vectorized(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])), - map(std::log1p) - ); - } - Vectorized log2() const { - return USE_SLEEF( - Vectorized(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])), - map(std::log2) - ); - } - Vectorized nextafter(const Vectorized &b) const { - USE_SLEEF( - { - return Vectorized(Sleef_nextafterf4(values.val[0], b.values.val[0]), - Sleef_nextafterf4(values.val[1], b.values.val[1])); - }, - { - __at_align__ float 
tmp[size()]; - __at_align__ float tmp_b[size()]; - store(tmp); - b.store(tmp_b); - for (const auto i : c10::irange(size())) { - tmp[i] = std::nextafter(tmp[i], tmp_b[i]); - } - return loadu(tmp); - } - ) - } - Vectorized frac() const; - Vectorized sin() const { - return USE_SLEEF( - Vectorized(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])), - map(std::sin) - ); - } - Vectorized sinh() const { - return USE_SLEEF( - Vectorized(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])), - map(std::sinh) - ); - } - Vectorized cos() const { - return USE_SLEEF( - Vectorized(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])), - map(std::cos) - ); - } - Vectorized cosh() const { - return USE_SLEEF( - Vectorized(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])), - map(std::cosh) - ); - } - Vectorized ceil() const { - return map(at::native::ceil_impl); - } - Vectorized floor() const { - return map(at::native::floor_impl); - } - Vectorized neg() const { - return Vectorized( - vnegq_f32(values.val[0]), - vnegq_f32(values.val[1])); - } - Vectorized round() const { - // We do not use std::round because we would like to round midway numbers to the nearest even integer. - return map(at::native::round_impl); - } - Vectorized tan() const { - return USE_SLEEF( - Vectorized(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])), - map(std::tan) - ); - } - Vectorized tanh() const { - return USE_SLEEF( - Vectorized(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])), - map(std::tanh) - ); - } - Vectorized trunc() const { - float32x4_t r0 = vrndq_f32(values.val[0]); - float32x4_t r1 = vrndq_f32(values.val[1]); - return Vectorized(r0, r1); - } - Vectorized lgamma() const { - return USE_SLEEF( - Vectorized(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])), - map(std::lgamma) - ); - } - Vectorized sqrt() const { - return Vectorized( - vsqrtq_f32(values.val[0]), - vsqrtq_f32(values.val[1])); - } - Vectorized reciprocal() const { - auto r0 = vdivq_f32(vdupq_n_f32(1.0f), values.val[0]); - auto r1 = vdivq_f32(vdupq_n_f32(1.0f), values.val[1]); - return Vectorized(r0, r1); - } - Vectorized rsqrt() const { - return this->sqrt().reciprocal(); - } - Vectorized pow(const Vectorized &exp) const { - USE_SLEEF( - { - return Vectorized(Sleef_powf4_u10(values.val[0], exp.values.val[0]), - Sleef_powf4_u10(values.val[1], exp.values.val[1])); - }, - { - __at_align__ float tmp[size()]; - __at_align__ float tmp_exp[size()]; - store(tmp); - exp.store(tmp_exp); - for (const auto i : c10::irange(size())) { - tmp[i] = std::pow(tmp[i], tmp_exp[i]); - } - return loadu(tmp); - } - ) - } - Vectorized operator==(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vceqq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vceqq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator!=(const Vectorized& other) const { - float32x4_t r0 = vreinterpretq_f32_u32( - vmvnq_u32(vceqq_f32(values.val[0], other.values.val[0]))); - float32x4_t r1 = vreinterpretq_f32_u32( - vmvnq_u32(vceqq_f32(values.val[1], other.values.val[1]))); - return Vectorized(r0, r1); - } - - Vectorized operator<(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcltq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcltq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator<=(const 
Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcleq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcleq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator>(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcgtq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcgtq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized operator>=(const Vectorized& other) const { - float32x4_t r0 = - vreinterpretq_f32_u32(vcgeq_f32(values.val[0], other.values.val[0])); - float32x4_t r1 = - vreinterpretq_f32_u32(vcgeq_f32(values.val[1], other.values.val[1])); - return Vectorized(r0, r1); - } - - Vectorized eq(const Vectorized& other) const; - Vectorized ne(const Vectorized& other) const; - Vectorized gt(const Vectorized& other) const; - Vectorized ge(const Vectorized& other) const; - Vectorized lt(const Vectorized& other) const; - Vectorized le(const Vectorized& other) const; -}; - -template <> -Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vaddq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vaddq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vsubq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vsubq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vmulq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vmulq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator/(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vdivq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vdivq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -// frac. Implement this here so we can use subtraction -inline Vectorized Vectorized::frac() const { - return *this - this->trunc(); -} - -//Added sleef Implementation for Maximum -Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - if(!a.has_inf_nan() && !b.has_inf_nan()){ - return USE_SLEEF( - Vectorized(Sleef_fmaxf4(a.get_low(), b.get_low()),Sleef_fmaxf4(a.get_high(), b.get_high())), - Vectorized(vmaxq_f32(a.get_low(), b.get_low()),vmaxq_f32(a.get_high(), b.get_high()))); - } - else{ - return Vectorized(vmaxq_f32(a.get_low(), b.get_low()),vmaxq_f32(a.get_high(), b.get_high())); - } - } - -// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if -// either input is a NaN. 
-template <> -Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vminq_f32(a.get_low(), b.get_low()); - float32x4_t r1 = vminq_f32(a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) { - return minimum(max, maximum(min, a)); -} - -template <> -Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) { - return minimum(max, a); -} - -template <> -Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) { - return maximum(min, a); -} - -template <> -Vectorized inline operator&(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(vandq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(vandq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator|(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline operator^(const Vectorized& a, const Vectorized& b) { - float32x4_t r0 = vreinterpretq_f32_u32(veorq_u32( - vreinterpretq_u32_f32(a.get_low()), - vreinterpretq_u32_f32(b.get_low()))); - float32x4_t r1 = vreinterpretq_f32_u32(veorq_u32( - vreinterpretq_u32_f32(a.get_high()), - vreinterpretq_u32_f32(b.get_high()))); - return Vectorized(r0, r1); -} - -inline Vectorized Vectorized::eq(const Vectorized& other) const { - return (*this == other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ne(const Vectorized& other) const { - return (*this != other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::gt(const Vectorized& other) const { - return (*this > other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::ge(const Vectorized& other) const { - return (*this >= other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::lt(const Vectorized& other) const { - return (*this < other) & Vectorized(1.0f); -} - -inline Vectorized Vectorized::le(const Vectorized& other) const { - return (*this <= other) & Vectorized(1.0f); -} - -template <> -inline void convert(const float* src, int32_t* dst, int64_t n) { - int64_t i; -#ifndef __msvc_cl__ -#pragma unroll -#endif - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i))); - vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4))); - } -#ifndef __msvc_cl__ -#pragma unroll -#endif - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -inline void convert(const int32_t* src, float* dst, int64_t n) { - int64_t i; -#ifndef __msvc_cl__ -#pragma unroll -#endif - for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) { - vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i))); - vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4))); - } -#ifndef __msvc_cl__ -#pragma unroll -#endif - for (; i < n; i++) { - dst[i] = static_cast(src[i]); - } -} - -template <> -Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float32x4_t r0 = vfmaq_f32(c.get_low(), a.get_low(), b.get_low()); - float32x4_t r1 = 
vfmaq_f32(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -template <> -Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) { - float32x4_t r0 = vfmsq_f32(c.get_low(), a.get_low(), b.get_low()); - float32x4_t r1 = vfmsq_f32(c.get_high(), a.get_high(), b.get_high()); - return Vectorized(r0, r1); -} - -inline Vectorized Vectorized::erf() const{ - // constants - const Vectorized neg_zero_vec(-0.f); - const Vectorized one_vec(1.0f); - const Vectorized p(0.3275911f); - const Vectorized p1(0.254829592f); - const Vectorized p2(-0.284496736f); - const Vectorized p3(1.421413741f); - const Vectorized p4(-1.453152027f); - const Vectorized p5(1.061405429f); - // sign(x) - auto sign_mask = neg_zero_vec & *this; - auto abs_vec = this->abs(); - // t = 1 / (p * abs(x) + 1) - auto tmp0 = fmadd(p, abs_vec, one_vec); - auto t = one_vec / tmp0; - // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 - auto tmp1 = fmadd(p5, t, p4); - auto tmp2 = fmadd(tmp1, t, p3); - auto tmp3 = fmadd(tmp2, t, p2); - auto r = fmadd(tmp3, t, p1); - // - exp(- x * x) - auto pow_2 = (*this) * (*this); - auto neg_pow_2 = pow_2 ^ neg_zero_vec; - auto tmp4 = neg_pow_2.map(std::exp); // This can be swapped for a faster implementation of exp. - auto tmp5 = tmp4 ^ neg_zero_vec; - // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) - auto tmp6 = t * tmp5; - auto tmp7 = fmadd(tmp6, r, one_vec); - return tmp7 ^ sign_mask; -} -#endif /* defined(aarch64) */ - -}} // namespace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 5d00a48ab187d..9b900cd0f63ee 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -258,19 +258,21 @@ __FORCE_INLINE void QuantizeAvx2( template<> struct Vectorized : public Vectorizedqi { using size_type = int; + static constexpr size_type kSize = Vectorized::size(); static constexpr size_type size() { - return 8; + return kSize; } + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); static constexpr int float_num_vecs() { - return 1; + return kFloatNumVecs; } static constexpr int int_num_vecs() { return 1; } - using float_vec_return_type = std::array, 1>; + using float_vec_return_type = std::array, kFloatNumVecs>; using int_vec_return_type = std::array, 1>; using value_type = c10::qint32::underlying; @@ -334,7 +336,7 @@ struct Vectorized : public Vectorizedqi { Vectorized retval; auto rhs_data = (__m256)rhs[0]; at::native::quantize_vec( - scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 8); + scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, size()); return retval; } @@ -447,20 +449,23 @@ __m256i RequantizeAvx2( template<> struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; static constexpr int size() { - return 32; + return kSize; } + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); static constexpr int float_num_vecs() { - return 4; + return kFloatNumVecs; } + static constexpr int kIntNumVecs = kSize / Vectorized::size(); static constexpr int int_num_vecs() { - return 4; + return kIntNumVecs; } - using float_vec_return_type = std::array, 4>; - using int_vec_return_type = std::array, 4>; + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; using value_type = typename c10::qint8::underlying; public: @@ -647,20 +652,23 @@ Vectorized inline maximum(const Vectorized& 
a, const Vec template<> struct Vectorized : public Vectorizedqi { + static constexpr int kSize = VECTOR_WIDTH; static constexpr int size() { - return 32; + return kSize; } + static constexpr int kFloatNumVecs = kSize / Vectorized::size(); static constexpr int float_num_vecs() { - return 4; + return kFloatNumVecs; } + static constexpr int kIntNumVecs = kSize / Vectorized::size(); static constexpr int int_num_vecs() { - return 4; + return kIntNumVecs; } - using float_vec_return_type = std::array, 4>; - using int_vec_return_type = std::array, 4>; + using float_vec_return_type = std::array, kFloatNumVecs>; + using int_vec_return_type = std::array, kIntNumVecs>; using value_type = typename c10::quint8::underlying; public: @@ -864,11 +872,11 @@ struct VectorizedQuantizedConverter { } static constexpr int float_num_vecs() { - return size() / 8; + return size_ / Vectorized::size(); } static constexpr int int_num_vecs() { - return size() / 8; + return size_ / Vectorized::size(); } using float_vec_return_type = float_vec_return_type_; @@ -897,19 +905,12 @@ struct VectorizedQuantizedConverter { Vectorized /*scale_zp_premul*/) const { float_vec_return_type rv; for (const auto i : c10::irange(float_num_vecs())) { - float tmp_vals[8]; - for (const auto j : c10::irange(8)) { + float tmp_vals[Vectorized::size()]; + for (const auto j : c10::irange(Vectorized::size())) { tmp_vals[j] = at::native::dequantize_val( - scale[j], zero_point[j], T(vals[8 * i + j])); + scale[j], zero_point[j], T(vals[Vectorized::size() * i + j])); } - rv[i] = Vectorized(tmp_vals[0], - tmp_vals[1], - tmp_vals[2], - tmp_vals[3], - tmp_vals[4], - tmp_vals[5], - tmp_vals[6], - tmp_vals[7]); + rv[i] = Vectorized(tmp_vals); } return rv; } @@ -930,25 +931,8 @@ struct Vectorized : public VectorizedQuantizedConverter< c10::qint32, std::array, 1>, std::array, 1>, - 8> { - Vectorized() - : VectorizedQuantizedConverter< - c10::qint32, - std::array, 1>, - std::array, 1>, - 8>() {} - Vectorized(c10::qint32 val) - : VectorizedQuantizedConverter< - c10::qint32, - std::array, 1>, - std::array, 1>, - 8>(val) {} - Vectorized(const void* ptr) - : VectorizedQuantizedConverter< - c10::qint32, - std::array, 1>, - std::array, 1>, - 8>(ptr) {} + Vectorized::size()> { + using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { return Vectorized(ptr); @@ -973,10 +957,10 @@ struct Vectorized : public VectorizedQuantizedConverter< int32_t zero_point, float /*inverse_scale*/) { std::array qvals; - std::array float_vals; + std::array::size()> float_vals; for (const auto i : c10::irange(float_num_vecs())) { - rhs[i].store(&float_vals[i * 8], 8); + rhs[i].store(&float_vals[i * Vectorized::size()]); } at::native::quantize_vec( @@ -984,7 +968,7 @@ struct Vectorized : public VectorizedQuantizedConverter< zero_point, float_vals.data(), (c10::qint32*)qvals.data(), - 8 * float_num_vecs()); + float_vals.size()); return Vectorized::loadu(qvals.data()); } @@ -1075,25 +1059,8 @@ struct Vectorized : public VectorizedQuantizedConverter< c10::qint8, std::array, 4>, std::array, 4>, - 32> { - Vectorized() - : VectorizedQuantizedConverter< - c10::qint8, - std::array, 4>, - std::array, 4>, - 32>() {} - Vectorized(c10::qint8 val) - : VectorizedQuantizedConverter< - c10::qint8, - std::array, 4>, - std::array, 4>, - 32>(val) {} - Vectorized(const void* ptr) - : VectorizedQuantizedConverter< - c10::qint8, - std::array, 4>, - std::array, 4>, - 32>(ptr) {} + 4 * Vectorized::size()> { + using 
VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { return Vectorized(ptr); @@ -1118,10 +1085,10 @@ struct Vectorized : public VectorizedQuantizedConverter< int32_t zero_point, float /*inverse_scale*/) { std::array qvals; - std::array float_vals; + std::array::size()> float_vals; for (const auto i : c10::irange(float_num_vecs())) { - rhs[i].store(&float_vals[i * 8], 8); + rhs[i].store(&float_vals[i * Vectorized::size()]); } at::native::quantize_vec( @@ -1129,7 +1096,7 @@ struct Vectorized : public VectorizedQuantizedConverter< zero_point, float_vals.data(), (c10::qint8*)qvals.data(), - 8 * float_num_vecs()); + float_vals.size()); return Vectorized::loadu(qvals.data()); } @@ -1208,25 +1175,8 @@ struct Vectorized : public VectorizedQuantizedConverter< c10::quint8, std::array, 4>, std::array, 4>, - 32> { - Vectorized() - : VectorizedQuantizedConverter< - c10::quint8, - std::array, 4>, - std::array, 4>, - 32>() {} - Vectorized(c10::quint8 val) - : VectorizedQuantizedConverter< - c10::quint8, - std::array, 4>, - std::array, 4>, - 32>(val) {} - Vectorized(const void* ptr) - : VectorizedQuantizedConverter< - c10::quint8, - std::array, 4>, - std::array, 4>, - 32>(ptr) {} + 4 * Vectorized::size()> { + using VectorizedQuantizedConverter::VectorizedQuantizedConverter; static Vectorized loadu(const void* ptr) { return Vectorized(ptr); @@ -1251,10 +1201,10 @@ struct Vectorized : public VectorizedQuantizedConverter< int32_t zero_point, float /*inverse_scale*/) { std::array qvals; - std::array float_vals; + std::array::size()> float_vals; for (const auto i : c10::irange(float_num_vecs())) { - rhs[i].store(&float_vals[i * 8], 8); + rhs[i].store(&float_vals[i * Vectorized::size()]); } at::native::quantize_vec( @@ -1262,7 +1212,7 @@ struct Vectorized : public VectorizedQuantizedConverter< zero_point, float_vals.data(), (c10::quint8*)qvals.data(), - 8 * float_num_vecs()); + float_vals.size()); return Vectorized::loadu(qvals.data()); } @@ -1340,29 +1290,44 @@ Vectorized inline maximum(const Vectorized& a, const V #endif // if defined(CPU_CAPABILITY_AVX2) #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) -template -typename std::enable_if_t, at::vec::Vectorized> -inline convert_int8_to_float(at::vec::Vectorized src) { - // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() +std::pair, Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { auto s8x8 = vld1_s8(src.operator const int8_t*()); auto s16x8 = vmovl_s8(s8x8); auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); - return Vectorized(vcvtq_f32_s32(s32x4_lo), vcvtq_f32_s32(s32x4_hi)); + return std::make_pair(Vectorized(vcvtq_f32_s32(s32x4_lo)), Vectorized(vcvtq_f32_s32(s32x4_hi))); } -template -typename std::enable_if_t, at::vec::Vectorized> -inline convert_int8_to_float(at::vec::Vectorized src) { - // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() +std::pair, Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { auto u8x8 = vld1_u8(src.operator const uint8_t*()); auto u16x8 = vmovl_u8(u8x8); auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8)); auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); - return Vectorized(vcvtq_f32_u32(u32x4_lo), vcvtq_f32_u32(u32x4_hi)); + return std::make_pair(Vectorized(vcvtq_f32_u32(u32x4_lo)), Vectorized(vcvtq_f32_u32(u32x4_hi))); +} + +Vectorized +inline 
convert_int8_half_register_to_float(at::vec::Vectorized src) { + auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s16x8 = vmovl_s8(s8x8); + + auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); + + return Vectorized(vcvtq_f32_s32(s32x4_lo)); +} + +Vectorized +inline convert_int8_half_register_to_float(at::vec::Vectorized src) { + auto u8x8 = vld1_u8(src.operator const uint8_t*()); + auto u16x8 = vmovl_u8(u8x8); + auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8)); + + return Vectorized(vcvtq_f32_u32(u32x4_lo)); } #endif diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 8427f7278986b..843e2dfcb8795 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -40,6 +40,9 @@ template <> class Vectorized { values = _mm512_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8, val9, val10, val11, val12, val13, val14, val15, val16); } + Vectorized(const float (&arr)[16]) + : Vectorized(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7], + arr[8], arr[9], arr[10], arr[11], arr[12], arr[13], arr[14], arr[15]) {} operator __m512() const { return values; } diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index 3b38a6c3557a4..2b29caf5edd61 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -62,6 +62,16 @@ Windows llvm will not have this defination. #endif #define VECTOR_WIDTH 64 #define int_vector __m512i +#elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +// SVE code expects 256-vectors; leave that set for SVE? +#if defined(__GNUC__) +#define __at_align__ __attribute__((aligned(16))) +#elif defined(_WIN32) +#define __at_align__ __declspec(align(16)) +#else +#define __at_align__ +#endif +#define VECTOR_WIDTH 16 #else // CPU_CAPABILITY_AVX512 #if defined(__GNUC__) #define __at_align__ __attribute__((aligned(32))) @@ -138,40 +148,10 @@ struct Vectorized { public: using value_type = T; using size_type = int; - // Note [constexpr static function to avoid odr-usage compiler bug] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Why, you might ask, is size defined to be a static constexpr function, - // rather than a more ordinary 'static constexpr int size;' variable? - // The problem lies within ODR rules for static constexpr members versus - // static constexpr functions. First, recall that this class (along with all - // of its derivations) live in an anonymous namespace: they are intended to be - // *completely* inlined at their use-sites, because we need to compile it - // multiple times for different instruction sets. - // - // Because of this constraint, we CANNOT provide a single definition for - // any static members in this class; since we want to compile the class - // multiple times, there wouldn't actually be any good place to put the - // definition. Now here is the problem: if we ODR-use a static constexpr - // member, we are *obligated* to provide a definition. Without the - // definition, you get a compile error like: - // - // relocation R_X86_64_PC32 against undefined symbol - // `_ZN2at6vec25612_GLOBAL__N_16VectorizedIdE4sizeE' can not be used when making - // a shared object; recompile with -fPIC - // - // If this were C++17, we could replace a static constexpr variable with - // an inline variable which doesn't require one definition. But we are not - // C++17. 
So the next best thing is to replace the member with a static - // constexpr (and therefore inline) function, which does not require ODR - // either. - // - // Also, technically according to the C++ standard, we don't have to define - // a constexpr variable if we never odr-use it. But it seems that some - // versions GCC/Clang have buggy determinations on whether or not an - // identifier is odr-used or not, and in any case it's hard to tell if - // a variable is odr-used or not. So best to just cut the problem at the root. + + static constexpr size_type kSize = VECTOR_WIDTH / sizeof(T); static constexpr size_type size() { - return VECTOR_WIDTH / sizeof(T); + return kSize; } Vectorized() : values{static_cast(0)} {} Vectorized(T val) { @@ -183,6 +163,9 @@ struct Vectorized { typename = std::enable_if_t<(sizeof...(Args) == size())>> Vectorized(Args... vals) : values{vals...}{ } + Vectorized(const T(&arr)[kSize]) { + std::memcpy(values, arr, sizeof(values)); + } // This also implies const T& operator[](int idx) const inline operator const T*() const { return values; diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index e2b64013a77d5..bc480e781cc81 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1747,6 +1747,7 @@ namespace { } while (0) TEST_CONVERT_TO(int8_t); TEST_CONVERT_TO(uint8_t); + TEST_CONVERT_TO(float); #undef TEST_CONVERT_TO } #endif diff --git a/setup.py b/setup.py index 0bd12aacacfce..3aec923516133 100644 --- a/setup.py +++ b/setup.py @@ -1252,6 +1252,7 @@ def main(): "include/*.h", "include/ATen/*.h", "include/ATen/cpu/*.h", + "include/ATen/cpu/vec/vec128/*.h", "include/ATen/cpu/vec/vec256/*.h", "include/ATen/cpu/vec/vec256/vsx/*.h", "include/ATen/cpu/vec/vec256/zarch/*.h", diff --git a/test/test_mps.py b/test/test_mps.py index 6a9adf7decd8c..4540e154ccfa9 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -12078,6 +12078,10 @@ class TestConsistency(TestCaseMPS): 'nn.functional.triplet_margin_loss', 'nn.functional.triplet_margin_with_distance_loss', 'nn.functional.batch_norm', + # NOTE: nn.functional.group_norm is here because 1 ULP difference in the mean + # output from the forward pass (tolerable) blew up into 8 ULP difference from + # the backward pass, and MPS uses fp16 accumulation anyway. + 'nn.functional.group_norm', 'nn.functional.instance_norm', 'round', 'xlogy', 'addcmul', 'nn.functional.cross_entropy', diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index 64a175de63997..ceaa9c8cdb1cf 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -3091,7 +3091,12 @@ def gen_transposed_tile_load_store(self, name, var, index, is_store): tile_var = self.cse.cache[load_or_store] if need_define: - define_line = f"alignas({factor}) {DTYPE_TO_CPP[dtype]} {tile_var}[{factor}*{factor}];" + cpp_dtype = DTYPE_TO_CPP[dtype] + # tiling_factor might be smaller than the alignment of cpp_dtype, such as + # with a vector that only holds 4 elements due to NEON 128-bit vectors and + # cpp_dtype being a 64-bit integer. 
+ alignas = f"alignas(std::max(std::size_t({factor}), alignof({cpp_dtype})))" + define_line = f"{alignas} {cpp_dtype} {tile_var}[{factor}*{factor}];" self.preloads.writeline(define_line) load_or_store = load_or_store.replace("__place_holder__", str(tile_var)) diff --git a/torch/_inductor/cpu_vec_isa.py b/torch/_inductor/cpu_vec_isa.py index 9b05a35d4e190..c249c6311b753 100644 --- a/torch/_inductor/cpu_vec_isa.py +++ b/torch/_inductor/cpu_vec_isa.py @@ -154,10 +154,10 @@ def __bool__impl(self, vec_isa_ok) -> bool: @dataclasses.dataclass class VecNEON(VecISA): - _bit_width = 256 # This is required to leverage the compute implemented in aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h + _bit_width = 128 # This is required to leverage the compute implemented in aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h _macro = ["CPU_CAPABILITY_NEON", "AT_BUILD_ARM_VEC256_WITH_SLEEF"] _arch_flags = "" # Unused - _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16} + _dtype_nelements = {torch.float: 4, torch.bfloat16: 8, torch.float16: 8} def __str__(self) -> str: return "asimd" # detects the presence of advanced SIMD on armv8-a kernels From e299193423a85d6c72e76061b5e084cd0146e2a3 Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Sat, 26 Oct 2024 01:17:11 +0000 Subject: [PATCH 099/161] Bug fix: Use oneDNN for `torch._int_mm` CPU only when avx512_vnni is supported (#136942) Fixes #136746 If AVX512_VNNI is not supported, overflow occurs inside oneDNN. Fall back to ref path in such case. UT is also updated to catch the issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/136942 Approved by: https://github.com/jgong5, https://github.com/ezyang --- aten/src/ATen/native/LinearAlgebra.cpp | 3 ++- test/test_linalg.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 1016ed8606b2e..abc65ae5c6772 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -3558,7 +3559,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) } bool dispatched = false; - if (at::globalContext().userEnabledMkldnn()) { + if (at::globalContext().userEnabledMkldnn() && at::cpu::is_avx512_vnni_supported()) { try { mkldnn_matmul_i8i8i32(self, mat2, result); dispatched = true; diff --git a/test/test_linalg.py b/test/test_linalg.py index 32b7ca69de6a7..e9a7aa0fa6c76 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -6435,7 +6435,7 @@ def genf_int_float(x, y, use_transpose, non_contig_type): x, y = y, x if non_contig_type != 0: y = y * 2 - x_int8 = torch.randint(-10, 10, (x, y), dtype=torch.int8, device=device) + x_int8 = torch.randint(-128, 127, (x, y), dtype=torch.int8, device=device) x_float = x_int8.to(torch.float32) if non_contig_type == 1: x_int8 = x_int8[:, : y // 2] From 565a53d3261c37a66a0e977b7f3a3ceee50effc2 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Sat, 26 Oct 2024 01:27:02 +0000 Subject: [PATCH 100/161] Use DLPack for creating tensors out of custom classes, when available. (#138697) Fixes #120614 Takes over #120615 In summary, this PR: - Adds a `__dlpack__` attribute check in the tensor creation path (i.e. 
[`internal_new_from_data` @ tensor_new.cpp](https://github.com/pytorch/pytorch/blob/cdfe1bffd16bdd28adbe5518038f68e6ac45de8d/torch/csrc/utils/tensor_new.cpp#L266)) - Creates the tensor by using the DLPack machinery, instead of an element-by-element copy - No changes since #120615 - Adds a test, making sure the DLPack machinery is used - Wraps a tensor in a fresh `TensorDLPackWrapper` class that implements only the DLPack methods - Creates a new tensor from an instance of `TensorDLPackWrapper` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138697 Approved by: https://github.com/ezyang Co-authored-by: Wenzel Jakob --- test/test_dlpack.py | 30 ++++++++++++++++++++++++++++++ torch/csrc/utils/tensor_new.cpp | 17 +++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/test/test_dlpack.py b/test/test_dlpack.py index a9036be160b0a..fe1107ac850fc 100644 --- a/test/test_dlpack.py +++ b/test/test_dlpack.py @@ -15,6 +15,23 @@ from torch.utils.dlpack import from_dlpack, to_dlpack +# Wraps a tensor, exposing only DLPack methods: +# - __dlpack__ +# - __dlpack_device__ +# +# This is used for guaranteeing we are going through the DLPack method, and not +# something else, e.g.: CUDA array interface, buffer protocol, etc. +class TensorDLPackWrapper: + def __init__(self, tensor): + self.tensor = tensor + + def __dlpack__(self, *args, **kwargs): + return self.tensor.__dlpack__(*args, **kwargs) + + def __dlpack_device__(self, *args, **kwargs): + return self.tensor.__dlpack_device__(*args, **kwargs) + + class TestTorchDlPack(TestCase): exact_dtype = True @@ -251,6 +268,19 @@ def test_dlpack_normalize_strides(self): # gh-83069, make sure __dlpack__ normalizes strides self.assertEqual(z.stride(), (1,)) + @skipMeta + @onlyNativeDeviceTypes + def test_automatically_select_in_creation(self, device): + # Create a new tensor, and wrap it using TensorDLPackWrapper. + tensor = torch.rand(10) + wrap = TensorDLPackWrapper(tensor) + # Create a new tensor from the wrapper. + # This should identify that the wrapper class provides the DLPack methods + # and use them for creating the new tensor, instead of iterating element + # by element. + new_tensor = torch.tensor(wrap) + self.assertEqual(tensor, new_tensor) + instantiate_device_type_tests(TestTorchDlPack, globals()) diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 099991f841480..e6371498314ba 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -345,6 +345,23 @@ Tensor internal_new_from_data( } #endif + if (PyObject_HasAttrString(data, "__dlpack__")) { + py::object tensor_o = + py::module::import("torch").attr("utils").attr("dlpack").attr( + "from_dlpack")(py::handle(data)); + Tensor tensor = py::cast(tensor_o); + const auto& inferred_scalar_type = + type_inference ? tensor.scalar_type() : scalar_type; + auto device = device_opt.has_value() ? *device_opt : tensor.device(); + pybind11::gil_scoped_release no_gil; + maybe_initialize_device(device); + return tensor.to( + device, + inferred_scalar_type, + /*non_blocking=*/false, + /*copy=*/copy_variables); + } + auto device = device_opt.has_value() ? *device_opt : options.device(); auto sizes = compute_sizes(data, scalar_type); From 4af93fdb77187ab33cc024342ac90c3e37c96785 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sat, 26 Oct 2024 01:55:29 +0000 Subject: [PATCH 101/161] [BE]: Update cudnn_frontend submodule to 1.8.0 (#138709) Update cudnn frontend. 
Let's see what breaks @eqy Pull Request resolved: https://github.com/pytorch/pytorch/pull/138709 Approved by: https://github.com/eqy --- third_party/cudnn_frontend | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/cudnn_frontend b/third_party/cudnn_frontend index 2533f5e5c1877..936021bfed8c9 160000 --- a/third_party/cudnn_frontend +++ b/third_party/cudnn_frontend @@ -1 +1 @@ -Subproject commit 2533f5e5c1877fd76266133c1479ef1643ce3a8b +Subproject commit 936021bfed8c91dc416af1588b2c4eca631a9e45 From 1e1f0ceb40bd32733e0c189e02a6e71d7fb16dc1 Mon Sep 17 00:00:00 2001 From: Ryan Guo Date: Thu, 24 Oct 2024 17:26:19 -0700 Subject: [PATCH 102/161] Allow Lazy Module to be modelled as `UnspecializedNNModuleVariable` (#138639) This patch - removes the `is_lazy_module` check from `is_dynamic_nn_module`, and adds a regression test. - removes a series of dynamo expected failures on lazy modules. The few ones I checked all were failing due to speculation log divergence, similar to #138489. Note that #100047 introduced the conditional removed in this patch, and it was trying to fix #100001. But I've confirmed locally that #100001 no longer repros after this patch. Fixes #138489. See more context in the issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138639 Approved by: https://github.com/jansel --- test/dynamo/test_modules.py | 39 +++++++++++++++++++ .../TestLazyModules.test_lazy_conv1d | 0 .../TestLazyModules.test_lazy_conv1d_pickle | 0 .../TestLazyModules.test_lazy_conv2d | 0 .../TestLazyModules.test_lazy_conv2d_pickle | 0 .../TestLazyModules.test_lazy_conv3d | 0 .../TestLazyModules.test_lazy_conv3d_pickle | 0 ...yModules.test_lazy_conv_transpose1d_kwargs | 0 ...yModules.test_lazy_conv_transpose1d_pickle | 0 ...TestLazyModules.test_lazy_conv_transpose2d | 0 ...yModules.test_lazy_conv_transpose2d_kwargs | 0 ...yModules.test_lazy_conv_transpose2d_pickle | 0 ...TestLazyModules.test_lazy_conv_transpose3d | 0 ...yModules.test_lazy_conv_transpose3d_kwargs | 0 ...yModules.test_lazy_conv_transpose3d_pickle | 0 ...estLazyModules.test_lazy_conv_transposed1d | 0 .../TestLazyModules.test_lazy_linear_pickle | 0 .../TestLazyModules.test_linear | 0 torch/_dynamo/mutation_guard.py | 4 +- 19 files changed, 40 insertions(+), 3 deletions(-) delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_kwargs delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_kwargs delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_kwargs delete mode 100644 
test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transposed1d delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_lazy_linear_pickle delete mode 100644 test/dynamo_expected_failures/TestLazyModules.test_linear diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index 349bc42498e19..5e1c1369b423e 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -1613,6 +1613,45 @@ def test_lazy_module_kwargs(self): exp_res = m(x, y) self.assertTrue(torch.allclose(exp_res, opt_m(x, y))) + # RuntimeError: SymIntArrayRef expected to contain only concrete integers + @expectedFailureDynamic + def test_lazy_module_speculation_log_divergence(self): + class ModWithOneLazyLinear(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.layer = torch.nn.LazyLinear(8) + + def forward(self, x): + return self.layer(x) + + # This allows us to restart tracing without clearing speculation log + def id_and_fail_inlining(x): + torch._dynamo.graph_break() + return x + + cnt = torch._dynamo.testing.CompileCounter() + + @torch.compile(backend=cnt) + def test(mod, x): + res = mod(x) + # Speculation log must not diverge in the 2nd round of tracing, + # after we've initialized the `LazyLinear` into a `Linear` in the + # 1st round. + res2 = id_and_fail_inlining(res) + return res + + mod = ModWithOneLazyLinear() + x = torch.ones(10, 3) + + # Make sure we don't get recompilation across multiple runs + actual_res = test(mod, x) + expect_res = mod(x) + self.assertTrue(torch.allclose(expect_res, actual_res)) + actual_res = test(mod, x) + expect_res = mod(x) + self.assertTrue(torch.allclose(expect_res, actual_res)) + self.assertEqual(cnt.frame_count, 1) + def test_call_fn_with_non_const_inputs_safe(self): class ModuleSpecialFwd(torch.nn.Module): def __init__(self) -> None: diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv1d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv2d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv3d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_kwargs b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_kwargs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_pickle 
b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose1d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_kwargs b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_kwargs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose2d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_kwargs b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_kwargs deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transpose3d_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transposed1d b/test/dynamo_expected_failures/TestLazyModules.test_lazy_conv_transposed1d deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_lazy_linear_pickle b/test/dynamo_expected_failures/TestLazyModules.test_lazy_linear_pickle deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/test/dynamo_expected_failures/TestLazyModules.test_linear b/test/dynamo_expected_failures/TestLazyModules.test_linear deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/torch/_dynamo/mutation_guard.py b/torch/_dynamo/mutation_guard.py index a773ca334bfbb..bdc24c421dba4 100644 --- a/torch/_dynamo/mutation_guard.py +++ b/torch/_dynamo/mutation_guard.py @@ -6,7 +6,7 @@ from torch.nn import Module from . import config -from .utils import ExactWeakKeyDictionary, is_lazy_module, nn_module_has_global_hooks +from .utils import ExactWeakKeyDictionary, nn_module_has_global_hooks unpatched_nn_module_init = torch.nn.Module.__init__ @@ -99,8 +99,6 @@ def is_dynamic_nn_module(obj: Any, is_export: bool) -> bool: return True if hasattr(obj, "torchdynamo_force_dynamic"): return obj.torchdynamo_force_dynamic - if is_lazy_module(obj): - return False # For export, we will have to fix # 1) Input signature problem because params are lifted as inputs # 2) nn module stack info changes From f14247d5aa0714d7cb63ed6961999a0b15ce8f23 Mon Sep 17 00:00:00 2001 From: Ryan Guo Date: Thu, 24 Oct 2024 17:26:20 -0700 Subject: [PATCH 103/161] [dynamo] Accurately identify mutated cells captured by multiple functions (#138632) This patch changes `mutated_closure_cell_contents: Set[str]` to `mutated_closure_cell_ids: Set[int]` so that Dynamo can more accurately identify closure cells across different instances of `UserFunctionVariable`. 
This prevents Dynamo from mistakenly treat a cell as immutable, despite it'll be mutated when referenced as closure cell from another function. More context in https://github.com/pytorch/pytorch/issues/138112#issuecomment-2420580779. Fixes #138112. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138632 Approved by: https://github.com/jansel ghstack dependencies: #138639 --- test/dynamo/test_misc.py | 27 +++++++++++++++++++++++++++ torch/_dynamo/convert_frame.py | 4 ++-- torch/_dynamo/symbolic_convert.py | 24 +++++++++++------------- torch/_dynamo/variables/functions.py | 9 +++++---- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 2c1ccf9eb31bb..5f786e6c249bb 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -3768,6 +3768,33 @@ def deep(x): self.assertTrue(torch.allclose(exp1, actual1)) self.assertTrue(torch.allclose(exp2, actual2)) + def test_closure_write_across_functions(self): + z = 1 + k = 2 + + def create_fn(): + def fn(x): + nonlocal k, z + k = z + + return fn + + def update_z_and_run_fn(fn, x): + nonlocal z + z = 3 + fn(x) + return x.cos() + + @torch.compile(backend="eager") + def foo(x): + fn = create_fn() + return update_z_and_run_fn(fn, x) + + x = torch.randn(1) + foo(x) + self.assertEqual(3, z) + self.assertEqual(3, k) + def test_top_package_import(self): def fn(x): import torch.fx diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 856e399a6d8af..6a313b08c64c8 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -648,7 +648,7 @@ def transform( one_graph, export, export_constraints, - mutated_closure_cell_contents, + mutated_closure_cell_ids, frame_state=frame_state, speculation_log=speculation_log, distributed_state=distributed_state, @@ -865,7 +865,7 @@ def count_args(code: CodeType) -> int: ): restart_reasons: set[str] = set() # This is shared across restarts - mutated_closure_cell_contents: Set[str] = set() + mutated_closure_cell_ids: Set[int] = set() speculation_log = SpeculationLog() if compile_pg := get_compile_pg(): distributed_state = DistributedState(compile_pg, LocalState()) diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 40f05e94891cb..09c2c59e60944 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -2759,7 +2759,8 @@ def __init__( class InstructionTranslator(InstructionTranslatorBase): - mutated_closure_cell_contents: Set[str] + mutated_closure_cell_ids: Set[int] + contents_var_to_mutated_cell: Dict[VariableTracker, Any] @staticmethod def current_tx() -> "InstructionTranslator": @@ -2787,7 +2788,7 @@ def __init__( one_graph, export, export_constraints, - mutated_closure_cell_contents: Set[str], + mutated_closure_cell_ids: Set[int], frame_state, speculation_log: SpeculationLog, distributed_state: Optional[DistributedState], @@ -2832,7 +2833,8 @@ def __init__( with tracing(self.output.tracing_context), self.set_current_tx(): self.one_graph: bool = one_graph self.export = export - self.mutated_closure_cell_contents = mutated_closure_cell_contents + self.mutated_closure_cell_ids = mutated_closure_cell_ids + self.contents_var_to_mutated_cell = {} if self.export: assert ( self.one_graph @@ -3350,19 +3352,15 @@ def STORE_DEREF(self, inst): # type: ignore[override] self.symbolic_locals[inst.argval], self.pop() ) else: + root_tx = self.output.root_tx if ( maybe_cell is not None - and maybe_cell.source.name() - not in 
self.output.root_tx.mutated_closure_cell_contents + and maybe_cell in root_tx.contents_var_to_mutated_cell + and id(root_tx.contents_var_to_mutated_cell[maybe_cell]) + not in root_tx.mutated_closure_cell_ids ): - # Why is the source name here unique? - # mutated_closure_cell_contents is a per-frame - # concept, and sources identify, e.g., particular - # locals from the frame. If you had two locals, - # they'll get different source names, and therefore - # differ here. - self.output.root_tx.mutated_closure_cell_contents.add( - maybe_cell.source.name() + self.output.root_tx.mutated_closure_cell_ids.add( + id(root_tx.contents_var_to_mutated_cell[maybe_cell]) ) raise exc.UnspecializeRestartAnalysis unimplemented("write to __closure__ while inlining") diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py index 1e7d4ab49e0a8..188a6fe5cb8eb 100644 --- a/torch/_dynamo/variables/functions.py +++ b/torch/_dynamo/variables/functions.py @@ -274,10 +274,7 @@ def bind_args(self, parent, args, kwargs): # Cell has not yet been assigned contents_var = variables.DeletedVariable() - if ( - closure_cell_contents.name() - not in tx.mutated_closure_cell_contents - ): + if id(cell) not in tx.mutated_closure_cell_ids: # Optimistically don't allocate the cell, to # reduce the number of side effects. This is # important for cond, as without it, any accesses @@ -287,6 +284,10 @@ def bind_args(self, parent, args, kwargs): # the analysis with this cell's name in the # mutated list here result[name] = contents_var + # Map the variable to the original cell so we can + # look it up later, see + # `InliningInstructionTranslator.STORE_DEREF`. + tx.contents_var_to_mutated_cell[contents_var] = cell continue # cells are written to with "cell_contents", From 0ac9a663ecc75355f8019841ad5db2e22b77f3ca Mon Sep 17 00:00:00 2001 From: Yidi Wu Date: Thu, 24 Oct 2024 13:15:35 -0700 Subject: [PATCH 104/161] [hop] always trace subgraph with fake to support .item in eager mode (#138771) Fixes https://github.com/pytorch/pytorch/issues/138664 When we eagerly run torch.cond with autograd keys set, we'll create_fw_bw_graph using real tensors. This PR forces fakification when cannot detect the fake mode so as to trace the .item calls. 
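For concreteness, here is a minimal eager-mode sketch of the pattern this unlocks (adapted from the test added below, not itself part of the patch): the branches call `.item()` on captured tensors and the input requires grad, so the autograd key is set and the forward/backward subgraphs now get traced under a fake mode rather than with real tensors. The `capture_scalar_outputs` toggle mirrors the config patch used in the test.

```python
import torch

# Sketch only: eager torch.cond whose branches call .item() on captured
# tensors; x.requires_grad routes dispatch through the autograd key.
torch._dynamo.config.capture_scalar_outputs = True  # as in the test below

b1, b2 = torch.tensor([3]), torch.tensor([4])

def true_fn(x):
    return x * b1.item()

def false_fn(x):
    return x * b2.item()

x = torch.randn(10, requires_grad=True)
out = torch.cond(torch.tensor(True), true_fn, false_fn, (x,))
```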
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138771 Approved by: https://github.com/zou3519, https://github.com/malfet --- test/functorch/test_control_flow.py | 38 +++++++++++++++++++++++++++++ torch/_higher_order_ops/cond.py | 8 ++++++ torch/_higher_order_ops/utils.py | 18 +++++++++++++- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index 200902b3dea44..b6c0e103dfee4 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -5524,6 +5524,44 @@ def test_while_loop_schema_gen(self): ) self.assertEqual(schema.parse(str(schema)), schema) + @skipIfTorchDynamo("Skip because dynamo cannot trace torch.export.") + @torch._dynamo.config.patch(capture_scalar_outputs=True) + def test_cond_eager_run_with_item(self): + class M(torch.nn.Module): + def forward(self, a, b1, b2, c): + def true_fn(x): + return x * b1.item() + + def false_fn(x): + return x * b2.item() + + r = torch.cond(a, true_fn, false_fn, (c,)) + return r * 2 + + x = torch.randn(10, requires_grad=True) + args = ( + torch.tensor(True), + torch.tensor([3]), + torch.tensor([4]), + x, + ) + model = M() + ep = torch.export.export(model, args) + self.assertExpectedInline( + ep.module().code.strip(), + """\ +def forward(self, a, b1, b2, c): + a, b1, b2, c, = fx_pytree.tree_flatten_spec(([a, b1, b2, c], {}), self._in_spec) + true_graph_0 = self.true_graph_0 + false_graph_0 = self.false_graph_0 + cond = torch.ops.higher_order.cond(a, true_graph_0, false_graph_0, [c, b1, b2]); a = true_graph_0 = false_graph_0 = c = b1 = b2 = None + getitem = cond[0]; cond = None + mul = torch.ops.aten.mul.Tensor(getitem, 2); getitem = None + return pytree.tree_unflatten((mul,), self._out_spec)""", # noqa: B950 + ) + expected_output = model(*args) + self.assertEqual(expected_output, x * 3 * 2) + instantiate_parametrized_tests(TestHopSchema) instantiate_parametrized_tests(TestControlFlowTraced) diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index d9c4d0ed8b109..f0a260de40681 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -444,6 +444,14 @@ def cond_fake_tensor_mode(mode, pred, true_fn, false_fn, operands): raise RuntimeError("Unmatched number of outputs from cond() branches.") for true_out, false_out in zip(flat_true_outs, flat_false_outs): + if true_out is None or false_out is None: + if true_out is None and false_out is None: + continue + raise torch._dynamo.exc.CondOpArgsMismatchError( + f"Expected both branches to return None:" + f"\n {true_fn.__name__} returns {true_out}" + f"\n {false_fn.__name__} returns {false_out}" + ) true_meta = _extract_tensor_metadata(true_out) false_meta = _extract_tensor_metadata(false_out) if true_meta != false_meta: diff --git a/torch/_higher_order_ops/utils.py b/torch/_higher_order_ops/utils.py index 389a51aed58e5..549e1af54f9b6 100644 --- a/torch/_higher_order_ops/utils.py +++ b/torch/_higher_order_ops/utils.py @@ -99,7 +99,23 @@ def _maybe_reenter_make_fx(fn): if _CURRENT_MAKE_FX_TRACER is not None: return reenter_make_fx(fn) else: - return make_fx(fn) + + def _maybe_make_fx_with_fake_mode(fn): + @functools.wraps(fn) + def wrapped(*args): + from torch._guards import detect_fake_mode + + fake_mode = detect_fake_mode(args) + if fake_mode is None: + # we creaeta a fake_mode here to make sure we could + # trace the graph with data-dependent calls e.g. 
.item() + return make_fx(fn, tracing_mode="fake")(*args) + # Tracing with real if all inputs have been fakfied + return make_fx(fn)(*args) + + return wrapped + + return _maybe_make_fx_with_fake_mode(fn) @contextmanager From 940658405bbe08beb3e8e4ff2f76c3f8d17e504b Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Sat, 26 Oct 2024 02:42:25 +0000 Subject: [PATCH 105/161] [test/test_cuda] Use temp file for test_improper_device_name (#138856) Use `tempfile.NamedTemporaryFile()` to have test_specify_improper_device_name save/load to a tmp file rather than the current-working-directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/138856 Approved by: https://github.com/Skylion007 --- test/test_cuda.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 91f0190865a63..fc041b5ad9eb6 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -590,10 +590,8 @@ def test_manual_seed(self): self.assertEqual(torch.cuda.initial_seed(), 2) def test_specify_improper_device_name(self): - import os - - fname = "tempfile.pt" - try: + with tempfile.TemporaryDirectory() as tmpdir: + fname = os.path.join(tmpdir, "tempfile.pt") with self.assertRaisesRegex(RuntimeError, "Invalid device string"): torch.save( [torch.nn.Parameter(torch.randn(10, 10))], @@ -601,9 +599,6 @@ def test_specify_improper_device_name(self): _use_new_zipfile_serialization=True, ) torch.load(fname, "cuda0") - finally: - if os.path.exists(fname): - os.remove(fname) def test_get_device_index(self): from torch.cuda._utils import _get_device_index From 00504aa6b8b0ae68761b89f023184202e8c79bc8 Mon Sep 17 00:00:00 2001 From: Syed Tousif Ahmed Date: Thu, 24 Oct 2024 11:17:58 -0700 Subject: [PATCH 106/161] Adds snapshot API for MemPools to get pool memory segments (#133601) Canonically, the snapshot API returns the entire memory state of the CUDACachingAllocator (using `get_all_blocks`). There is no API that can only return the memory state of a given pool. In this PR, we extend the functionality of snapshot API such that it can only return the memory addresses of an active pool. When snapshot API is called under a MemPoolContext, we only return the blocks that correspond to the pool id of the active pool. Part of https://github.com/pytorch/pytorch/issues/124807. 
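As a rough usage sketch (not taken from the patch; it assumes a `MemPool` constructed without a custom allocator falls back to the default CUDA caching allocator), the new pool-scoped query sits next to the existing allocator-wide one:

```python
import torch

# Sketch only: allocate inside a MemPool, then read back just that pool's
# segments via the new MemPool.snapshot(); a plain memory_snapshot() call
# outside any MemPoolContext still reports every pool as before.
pool = torch.cuda.MemPool()
with torch.cuda.use_mem_pool(pool):
    x = torch.randn(1024 * 1024 // 4, device="cuda")  # ~1 MB allocation
pool_segments = pool.snapshot()              # segments owned by `pool` only
all_segments = torch.cuda.memory_snapshot()  # allocator-wide, unchanged
print(len(pool_segments), len(all_segments))
```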
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133601 Approved by: https://github.com/ezyang --- c10/cuda/CUDACachingAllocator.cpp | 39 +++++++++++++++++++----- test/test_cuda.py | 20 ++++++++++++- torch/cuda/memory.py | 50 +++++++++++++++++++++---------- 3 files changed, 85 insertions(+), 24 deletions(-) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 245a180cecddb..4dc62366be238 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1897,16 +1897,41 @@ class DeviceCachingAllocator { std::unordered_map pool_to_id; pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); - for (const auto& pair : graph_pools) { - pool_to_id[pair.second.get()] = pair.first; + std::vector all_blocks; + MempoolId_t mempool_id = {0, 0}; + + auto active_mempool = MemPoolContext::getActiveMemPool(); + if (active_mempool) { + mempool_id = active_mempool->id(); } - for (const auto& pair : graph_pools_freeable) { - pool_to_id[pair.second] = pair.first; + + if (mempool_id.first != 0 || mempool_id.second != 0) { + // If there is an active mempool, we find the corresponding PrivatePool + // in graph_pools and only return the blocks from it. + auto pool = graph_pools.find(mempool_id); + if (pool != graph_pools.end()) { + pool_to_id[pool->second.get()] = pool->first; + all_blocks = get_private_pool_head_blocks(pool->second.get()); + } + auto pool_freeable = graph_pools_freeable.find(mempool_id); + if (pool_freeable != graph_pools_freeable.end()) { + pool_to_id[pool_freeable->second] = pool_freeable->first; + } + } else { + // When snapshot is called outside a MemPoolContext, we return + // all the blocks in the CUDACachingAllocator (as returned by + // get_all_blocks). + for (const auto& pair : graph_pools) { + pool_to_id[pair.second.get()] = pair.first; + } + for (const auto& pair : graph_pools_freeable) { + pool_to_id[pair.second] = pair.first; + } + all_blocks = get_all_blocks(); } size_t total_active = 0; std::vector result; - const auto all_blocks = get_all_blocks(); for (const Block* const head_block : all_blocks) { // For expandable segments, we report one segment for each contiguous @@ -2109,8 +2134,8 @@ class DeviceCachingAllocator { private: // All private methods do not acquire the allocator mutex. 
- std::vector get_all_blocks() const { - std::vector blocks; + std::vector get_all_blocks() const { + std::vector blocks; blocks.insert( blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); blocks.insert( diff --git a/test/test_cuda.py b/test/test_cuda.py index fc041b5ad9eb6..a0a5edba32ffd 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4542,9 +4542,10 @@ def test_mempool_with_allocator(self): alloc_lib = ctypes.CDLL(dummy_allocator) called_dummy_alloc = ctypes.c_int.in_dll(alloc_lib, "called_dummy_alloc") self.assertEqual(called_dummy_alloc.value, 0) + nelem_1mb = 1024 * 1024 // 4 with torch.cuda.use_mem_pool(pool): - out_0 = torch.randn(1, device="cuda") + out_0 = torch.randn(nelem_1mb, device="cuda") # pool's use count should be 2 at this point as use_mem_pool # holds a reference @@ -4558,6 +4559,23 @@ def test_mempool_with_allocator(self): # out tensor self.assertEqual(called_dummy_alloc.value, 123) + with torch.cuda.use_mem_pool(pool): + # pool should have 1 segment since we made a small allocation (1 MB) + # above and so the CUDACachingAllocator packed it into a 2 MB buffer + self.assertEqual(len(pool.snapshot()), 1) + + out_1 = torch.randn(nelem_1mb, device="cuda") + + # pool should still have 1 segment since we made another small allocation + # (1 MB) that got packed into the existing 2 MB buffer + self.assertEqual(len(pool.snapshot()), 1) + + out_2 = torch.randn(nelem_1mb, device="cuda") + + # pool now should have 2 segments since the CUDACachingAllocator had + # to make a new 2 MB buffer to accomodate out_2 + self.assertEqual(len(pool.snapshot()), 2) + def test_mempool_context(self): active_pool = torch.cuda.MemPoolContext.active_pool() diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index 521090b0db793..145458de3040f 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -980,6 +980,25 @@ def _get_current_allocator() -> _CUDAAllocator: return _CUDAAllocator(torch._C._cuda_getAllocator()) +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + class MemPool(_MemPool): r"""MemPool represents a pool of memory in a caching allocator. Currently, it's just the ID of the pool object maintained in the CUDACachingAllocator. @@ -1010,24 +1029,23 @@ def use_count(self) -> int: r"""Returns the reference count of this pool.""" return super().use_count() + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. -class MemPoolContext(_MemPoolContext): - r"""MemPoolContext holds the currently active pool and stashes the previous - pool. On deletion it makes the previous pool active. - - Args: - pool(torch.cuda.MemPool): a MemPool object to be made active so that - allocations route to this pool. - - """ + Interpreting the output of this function requires familiarity with the + memory allocator internals. - def __init__(self, pool: MemPool): - super().__init__(pool) - - @staticmethod - def active_pool() -> Optional[_MemPool]: - r"""Returns the active MemPool""" - return _MemPoolContext.active_pool() + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot @contextlib.contextmanager From 07e30eae2a8241e531890b6c9a33ab5a80c5ccaf Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 25 Oct 2024 16:59:20 -0700 Subject: [PATCH 107/161] [PGNCCL] Use non-blocking mode by default in eager init (#138527) ### Why use non-blocking mode in eager init? For overlapping comm init and model init, etc. ![image](https://github.com/user-attachments/assets/9b0bf7a9-be26-4d16-827b-dbe861f083cd) ### Why can we set non-blocking as default? If the setting is dangling -- i.e. not passed in by user nor set via env -- `ProcessGroupNCCL` can have some preferred logic. And torch-level API semantics does not change whether the NCCL comm is blocking or non-blocking (handled within `ProcessGroupNCCL`). ### Why not make non-blocking default for lazy mode as well? PR https://github.com/pytorch/pytorch/pull/137544 tried it. Two reasons why that's not preferred today: 1. It is hard -- too big a blast. 2. There is no gain by doing lazy init in non-blocking mode, because the right next CPU call is a collective, and we will block there waiting for comm to be ready, so same effect as blocked init, no "opening" compared to eager mode. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138527 Approved by: https://github.com/wconstab ghstack dependencies: #138860 --- test/distributed/test_c10d_nccl.py | 64 +++++++++---------- torch/csrc/cuda/nccl.cpp | 7 +- torch/csrc/distributed/c10d/NCCLUtils.cpp | 12 +--- torch/csrc/distributed/c10d/NCCLUtils.hpp | 37 ++++++----- .../distributed/c10d/ProcessGroupNCCL.cpp | 48 ++++++++++++-- .../distributed/c10d/ProcessGroupNCCL.hpp | 8 +++ 6 files changed, 106 insertions(+), 70 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 6d81901a7a66c..64a210ed3b6c0 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -321,25 +321,30 @@ def abortpg(): @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - def test_close_pg(self): + @parametrize("eager_init", [True, False]) + def test_close_pg(self, eager_init: bool): # Disable ASYNC_ERROR_HANDLING for this test to ensure we can programmatically # abort the process group. os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0" store = c10d.FileStore(self.file_name, self.world_size) - pg = self._create_process_group_nccl(store, self.opts()) - device = self.rank_to_GPU[self.rank][0] + device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + c10d.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + device_id=device if eager_init else None, + ) t = torch.rand(10, 10, device=device) # First allreduce to initialize state. 
- pg.allreduce(t) + dist.all_reduce(t) # Destroy pg and validate pg is no longer valid dist.destroy_process_group() - with self.assertRaises(dist.DistBackendError): - pg.allreduce([t]) - - del pg + with self.assertRaises(ValueError): + dist.all_reduce(t) CUDA_12_AND_ABOVE = torch.cuda.is_available() and ( torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12 @@ -803,27 +808,24 @@ def test_extend_nccl_pg_timeout(self, backend): @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - def test_comm_lazy_init_split(self): + @parametrize("eager_init", [True, False]) + def test_new_group(self, eager_init: bool): # Test the optimization of new groups that contain all world # ranks use the "transparent" `ncclCommSplit` optimization. store = c10d.FileStore(self.file_name, self.world_size) - pg = self._create_process_group_nccl(store, self.opts()) - - # Test lazy splitting behavior across each per-device backend. - for device in self.rank_to_GPU[self.rank]: - backend = pg._get_backend(torch.device(device)) - - # split doesn't happen unless the original process group has lazily - # created communicators, so first verify we haven't split even when - # making the new group and running an operation on the original pg. - ng = c10d.new_group() - tensor = torch.tensor([self.rank]).cuda(device) - pg.broadcast(tensor, 0) - self.assertEqual(backend.comm_split_count(), 0) - - # The new group will not force a split because it is a lazy init. - ng.broadcast(tensor, 0) - self.assertEqual(backend.comm_split_count(), 0) + device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + c10d.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + device_id=device if eager_init else None, + ) + ng = c10d.new_group() + tensor = torch.tensor([self.rank], device=device) + dist.broadcast(tensor, 0) + dist.broadcast(tensor, 0, group=ng) + dist.destroy_process_group() @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @@ -863,15 +865,11 @@ def test_comm_eager_init_subgroup(self): pg = self._create_process_group_nccl(store, self.opts()) backend = pg._get_backend(torch.device(device)) self.assertEqual(backend._is_initialized(), False) - - tensor = torch.full((1,), self.rank).cuda(device) + # create a subgroup eagerly new_group = c10d.new_group([0, 1], device_id=device) - self.assertEqual(backend.comm_split_count(), 0) - - new_backend = new_group._get_backend(torch.device(device)) - self.assertEqual(new_backend._is_initialized(), True) + tensor = torch.full((1,), self.rank).cuda(device) dist.broadcast(tensor, 0, group=new_group) - self.assertEqual(new_backend.comm_split_count(), 0) + # the default group should stay lazy self.assertEqual(backend._is_initialized(), False) torch.cuda.synchronize() dist.destroy_process_group() diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index a426d9043fa66..7be7b08efc6a6 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -159,7 +159,6 @@ static inline void NCCL_CHECK(ncclResult_t result) { } // TODO(eqy): can this duplication be avoided from NCCLUtils.cpp? 
-// Default value: on bool nccl_use_nonblocking() { static bool nccl_use_nonblocking_ = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; @@ -194,7 +193,8 @@ static inline void NCCL_CHECK_TIMEOUT(ncclResult status, ncclComm_t comm) { currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error("NCCL timeout."); + throw std::runtime_error( + "NCCL timeout when waiting for nonblocking call to become successful."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comm), &result); @@ -226,7 +226,8 @@ static inline void NCCL_CHECK_TIMEOUT( currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error("NCCL timeout."); + throw std::runtime_error( + "NCCL timeout when waiting for nonblocking call to become successful."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comms[i]), &result); diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 6bbb2318ba8ab..a86039c6ef4d4 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -31,7 +31,7 @@ ncclComm_t NCCLComm::getNcclComm() { commFailureMsg)); } // In non-blocking mode, ensure comm is ready. - if (nccl_use_nonblocking()) { + if (nonBlocking_) { // If timeout is reached, throw an exception. C10D_NCCL_CHECK_TIMEOUT_SLEEP(ncclInProgress, ncclComm_, std::nullopt); // ncclComm_ should be initialized by now @@ -101,6 +101,7 @@ std::shared_ptr NCCLComm::split( #endif ++source->ncclCommSplitCounter_; comm->rank_ = rank; + comm->nonBlocking_ = config.blocking == 0; LOG(INFO) << "Rank " << source->rank_ << ": created child comm " << comm->repr() << " with color_id " << color_id; return comm; @@ -163,15 +164,6 @@ size_t hashTensors(const std::vector& tensors) { } #endif -bool nccl_use_nonblocking() { - static bool nccl_use_nonblocking_ = - c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; - if (nccl_use_nonblocking_) { - TORCH_WARN_ONCE("Using experimental non-blocking NCCL communicator."); - } - return nccl_use_nonblocking_; -} - // Default value: 30 minutes int nccl_nonblocking_timeout() { static int timeout = -2; // -2 means not initialized diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index a5099ab583f97..af32ab83ef57c 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -236,7 +236,6 @@ DEFINE_CONSTANT(started_state, "started"); TORCH_API size_t hashTensors(const std::vector& tensors); TORCH_API std::string getNcclVersion(); TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error); -bool nccl_use_nonblocking(); int nccl_nonblocking_timeout(); // Provides additional detail into NCCL error codes based on when these are @@ -308,6 +307,8 @@ class NCCLComm { comm->ncclId_ = commId; comm->rank_ = rank; comm->initialized_ = true; + // Old style comm is always blocking. 
+ comm->nonBlocking_ = false; return comm; } @@ -318,26 +319,19 @@ class NCCLComm { ncclUniqueId commId, ncclConfig_t& config) { auto comm = std::make_shared(); - bool isInitialized = false; - if (nccl_use_nonblocking()) { - config.blocking = 0; - LOG(INFO) << "Rank " << rank - << ": creating NCCL communicator in nonblocking mode"; - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - std::nullopt); - } else { - C10D_NCCL_CHECK( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - std::nullopt); - // under blocking mode, comm is initialized after NCCL CHECK - isInitialized = true; - } + comm->nonBlocking_ = config.blocking == 0; + LOG(INFO) << "Rank " << rank << ": creating NCCL communicator with mode: " + << (comm->nonBlocking_ ? "nonblocking" : "blocking"); + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + std::nullopt); comm->ncclId_ = commId; comm->rank_ = rank; - comm->initialized_ = isInitialized; + // Under blocking mode, comm is initialized immediately after NCCL init + // returns; Under nonblocking mode, we check whether comm is initialized the + // *next* time ncclComm_ is accessed. + comm->initialized_ = !comm->nonBlocking_; return comm; } @@ -382,6 +376,7 @@ class NCCLComm { std::swap(aborted_, other.aborted_); std::swap(ncclAsyncErr_, other.ncclAsyncErr_); std::swap(initialized_, other.initialized_); + std::swap(nonBlocking_, other.nonBlocking_); } ncclComm_t getNcclComm(); @@ -550,6 +545,10 @@ class NCCLComm { // better error messaging. std::optional commFailureReason_{}; bool initialized_{false}; + // Whether this communicator is using nonblocking mode. Recorded during comm + // creation or split. For safety, we give a default value of true (more + // protection). + bool nonBlocking_{true}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL std::unordered_map registeredSegmentHandles_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 6206b4d6c5994..c9564a31f057c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -987,7 +987,6 @@ ProcessGroupNCCL::ProcessGroupNCCL( << ", TORCH_NCCL_ENABLE_TIMING: " << enableTiming_.load() << ", TORCH_NCCL_BLOCKING_WAIT: " << blockingWait_ << ", TORCH_DISTRIBUTED_DEBUG: " << torch_distributed_debug - << ", TORCH_NCCL_USE_COMM_NONBLOCKING: " << nccl_use_nonblocking() #ifdef NCCL_HAS_COMM_REGISTER << ", TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: " << useTensorRegisterAllocatorHook_ @@ -1059,6 +1058,39 @@ void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) { getNCCLComm(key, device, OpType::ALLREDUCE); } +bool ProcessGroupNCCL::useNonblocking() { +#ifndef NCCL_HAS_COMM_NONBLOCKING + return false; +#endif + // Already parsed, return the cached value + if (useNonblocking_.has_value()) { + return useNonblocking_.value(); + } + // Get environment variable. 
+ auto nbEnv = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING"); + + // 1st priority: Respect the user's setting + if (options_->config.blocking != NCCL_CONFIG_UNDEF_INT) { + useNonblocking_ = options_->config.blocking == 0; + } + // 2nd priority: Respect the environment variable + else if (nbEnv.has_value()) { + useNonblocking_ = nbEnv.value(); + } + // 3rd priority: automatically use nonblocking if we are in eager init mode + else if (getBoundDeviceId()) { + useNonblocking_ = true; + } + // 4th priority: otherwise, nonblocking = false to preserve old behavior + else { + useNonblocking_ = false; + } + + LOG(INFO) << logPrefix() + << "Using non-blocking mode: " << useNonblocking_.value(); + return useNonblocking_.value(); +} + void ProcessGroupNCCL::performNocolorSplit(at::Device device) { // If our backend doesn't support splitting, this is a no-op for // ranks not in the new subgroup (and ranks that would be in it will @@ -1067,6 +1099,8 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) { const auto key = getKeyFromDevice(device); LOG(INFO) << logPrefix() << "Performing nocolor split on backend device " << device << ", key " << key << ", i am " << this; + bool useNb = useNonblocking(); + options_->config.blocking = useNb ? 0 : 1; auto comm = getNCCLComm(key, device, OpType::ALLREDUCE); NCCLComm::split( comm.get(), @@ -2357,6 +2391,11 @@ std::shared_ptr ProcessGroupNCCL::getNCCLComm( rank = p2pRank; } +#ifdef NCCL_HAS_COMM_NONBLOCKING + bool useNb = useNonblocking(); + options_->config.blocking = useNb ? 0 : 1; +#endif + #ifdef NCCL_HAS_COMM_SPLIT if (options_->split_from) { // Find a valid, healthy communicator to split from if possible. @@ -2773,7 +2812,7 @@ c10::intrusive_ptr ProcessGroupNCCL::endCoalescing(OpType optype) { work->ncclStartEvent_->record(ncclStream); } - if (nccl_use_nonblocking()) { + if (useNonblocking()) { groupEndNonblocking(comm); } else { groupEnd(); @@ -3093,8 +3132,7 @@ c10::intrusive_ptr ProcessGroupNCCL::collectiveCoalesced( #endif { - torch::cuda::nccl::AutoNcclGroup nccl_group_guard( - comm, nccl_use_nonblocking()); + torch::cuda::nccl::AutoNcclGroup nccl_group_guard(comm, useNonblocking()); for (const auto i : c10::irange(inputs.size())) { // Both `inputs' and `outputs' are created on a worker stream and used in // different ncclStreams. Hence, both must record the ncclStream to @@ -4662,7 +4700,7 @@ void ProcessGroupNCCL::groupEndNonblocking( #ifndef NCCL_HAS_COMM_NONBLOCKING C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); #else - if (!nccl_use_nonblocking()) { + if (!useNonblocking()) { C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); } else { C10D_NCCL_CHECK_TIMEOUT_GROUPEND(ncclGroupEnd(), comm, std::nullopt); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 5ec9ae32405f6..839463a9d8be1 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -778,6 +778,10 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Abort all communicators on this rank. bool abortComms(const std::optional& abortReason = std::nullopt); + // A helper function to check if nonblocking API mode should be used. + // Use this helper instead of directly checking `useNonblocking_` variable. 
+ bool useNonblocking(); + private: int globalRankStart; int globalRankStride; @@ -1237,6 +1241,10 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::shared_ptr pgStatus_ = std::make_shared(); + + // Internal cached value: use NCCL non-blocking API mode or not. + // Use `useNonblocking()` method instead of accessing this variable directly. + std::optional useNonblocking_{std::nullopt}; }; // Dumps the NCCL comm traces and additional information about the Process From 10e2840ce3bfce775e7b9ea78aaae9bad845d3ef Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 25 Oct 2024 09:50:55 -0700 Subject: [PATCH 108/161] Enable failing diffs on update_hint_regression and sum_floordiv_regression and autograd benchmarks regression (#137548) update_hint_regression has been behaving, so I am setting 2% noise threshold for it. 1.5% for sum_floordiv_regression. I have one concern, with the way we do the regression detection. small or changes Date: Fri, 25 Oct 2024 13:19:47 -0700 Subject: [PATCH 109/161] Scoped extension building for C++ backed custom ops tests (#136695) FIXES #125579 #131103 #133197 #133283 #134738 #135369 #135685 Tests that create C++ extensions can cause flakiness in CI due to library namespace conflict and test ordering. We can build them in temp dirs to ensure isolation. An alternative is to build these as part of the build process and have build time errors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/136695 Approved by: https://github.com/zou3519 --- test/dynamo/test_misc.py | 8 +++-- test/inductor/test_compiled_autograd.py | 48 ++++++++++++++----------- test/test_autograd.py | 11 +++--- test/test_custom_ops.py | 16 ++++++--- torch/testing/_internal/common_utils.py | 24 ++++++++++++- 5 files changed, 73 insertions(+), 34 deletions(-) diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py index 5f786e6c249bb..21be24a2678c6 100644 --- a/test/dynamo/test_misc.py +++ b/test/dynamo/test_misc.py @@ -79,6 +79,7 @@ from torch.testing._internal.common_utils import ( freeze_rng_state, IS_FBCODE, + scoped_load_inline, set_default_dtype, skipIfNNModuleInlined, skipIfWindows, @@ -321,16 +322,17 @@ def add_fn(a, b, out): res_compiled = add_fn(2, 3, torch.tensor(0.0)) self.assertEqual(res, res_compiled) + @scoped_load_inline @skipIfNNModuleInlined("fails internal CI") @unittest.skipIf(IS_FBCODE, "inline cpp_extension doesn't work in fbcode") - def test_cpp_extension_recommends_custom_ops(self): + def test_cpp_extension_recommends_custom_ops(self, load_inline): cpp_source = """ #include at::Tensor foobar(const at::Tensor& x) { return x.clone(); } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="mylib", cpp_sources=cpp_source, functions="foobar", @@ -362,7 +364,7 @@ def f(x): return x.clone(); } """ - module2 = torch.utils.cpp_extension.load_inline( + module2 = load_inline( name="mylib2", cpp_sources=cpp_source, functions="baz", diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 7c09ea9d49e04..80b174cc4ae3e 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -26,7 +26,7 @@ from torch._dynamo.utils import counters from torch._inductor import config as inductor_config from torch._inductor.test_case import run_tests, TestCase -from torch.testing._internal.common_utils import skipIfWindows +from torch.testing._internal.common_utils import scoped_load_inline, skipIfWindows from torch.testing._internal.inductor_utils import GPU_TYPE, 
HAS_CPU, HAS_CUDA, HAS_GPU from torch.testing._internal.logging_utils import logs_to_string @@ -1586,7 +1586,8 @@ def _compiler_fn(gm): f, compiler_fn=compiler_fn_with_op_check, compile_fn=False ) - def test_non_traceable_autograd_cpp_node(self): + @scoped_load_inline + def test_non_traceable_autograd_cpp_node(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = false; @@ -1613,7 +1614,7 @@ def test_non_traceable_autograd_cpp_node(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_non_traceable_autograd_cpp_node", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1634,8 +1635,8 @@ def fn(): ), compiled_autograd.enable(compiler_fn): fn() - @unittest.skip("Flaky, cache from test ordering affects test. #135369") - def test_autograd_cpp_node(self): + @scoped_load_inline + def test_autograd_cpp_node(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -1662,7 +1663,7 @@ def test_autograd_cpp_node(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1682,7 +1683,8 @@ def fn(): # compiles for 10 (static) and 100 (dynamic) self.check_output_and_recompiles(fn, 2) - def test_autograd_cpp_node_id(self): + @scoped_load_inline + def test_autograd_cpp_node_id(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -1730,7 +1732,7 @@ def test_autograd_cpp_node_id(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_id", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1773,7 +1775,8 @@ def fn(op): self.check_output_and_recompiles(different_autograd_fn, 2) - def test_autograd_cpp_node_saved(self): + @scoped_load_inline + def test_autograd_cpp_node_saved(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -1827,7 +1830,7 @@ def test_autograd_cpp_node_saved(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_saved", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1848,7 +1851,8 @@ def fn(): self.check_output_and_recompiles(fn, 2) - def test_autograd_cpp_node_saved_dynamic(self): + @scoped_load_inline + def test_autograd_cpp_node_saved_dynamic(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -1884,7 +1888,7 @@ def test_autograd_cpp_node_saved_dynamic(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_saved_dynamic", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1904,7 +1908,8 @@ def fn(): # compiles for 10 (static) and 100 (dynamic) self.check_output_and_recompiles(fn, 2) - def test_autograd_cpp_node_saved_int(self): + @scoped_load_inline + def test_autograd_cpp_node_saved_int(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -1943,7 +1948,7 @@ def 
test_autograd_cpp_node_saved_int(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_saved_int", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -1962,7 +1967,8 @@ def fn(): self.check_output_and_recompiles(fn, 1) - def test_autograd_cpp_node_saved_float(self): + @scoped_load_inline + def test_autograd_cpp_node_saved_float(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -2001,7 +2007,7 @@ def test_autograd_cpp_node_saved_float(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_saved_float", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -2021,7 +2027,8 @@ def fn(): # compiled autograd and dynamo both support symfloat, but not backend self.check_output_and_recompiles(fn, [1, 3]) - def test_autograd_cpp_node_data_dependent(self): + @scoped_load_inline + def test_autograd_cpp_node_data_dependent(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -2092,7 +2099,7 @@ def test_autograd_cpp_node_data_dependent(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_autograd_cpp_node_data_dependent", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", @@ -2332,8 +2339,9 @@ def backward(ctx, gO): # Must skip since we do not know if the cpu scalar will be used only in ATen/prim ops. self.assertEqual(counters["inductor"]["cudagraph_skips"], 1) + @scoped_load_inline @unittest.skipIf(not HAS_CUDA, "requires cuda") - def test_cudagraphs_cpu_scalar_used_in_cpp_custom_op(self): + def test_cudagraphs_cpu_scalar_used_in_cpp_custom_op(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -2371,7 +2379,7 @@ def test_cudagraphs_cpu_scalar_used_in_cpp_custom_op(self): } """ - module = torch.utils.cpp_extension.load_inline( + module = load_inline( name="test_cudagraphs_cpu_scalar_used_in_cpp_custom_op", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", diff --git a/test/test_autograd.py b/test/test_autograd.py index 3025b954d78f0..f25ca30fd963b 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -69,6 +69,7 @@ IS_WINDOWS, parametrize, run_tests, + scoped_load_inline, set_warn_always_context, skipIfMps, skipIfNoLapack, @@ -85,7 +86,6 @@ CheckpointPolicy, create_selective_checkpoint_contexts, ) -from torch.utils.cpp_extension import load_inline from torch.utils.flop_counter import FlopCounterMode @@ -9854,7 +9854,8 @@ def test_scalar_grad_mixed_device(self): out = x * y out.sum().backward() - def test_multi_grad_all_hooks(self): + @scoped_load_inline + def test_multi_grad_all_hooks(self, load_inline): t1 = torch.rand(2, requires_grad=True) t2 = torch.rand(2, requires_grad=True) t3 = torch.rand(2, requires_grad=True) @@ -9899,19 +9900,19 @@ def backward(ctx, gO): return CustomOpAutogradFunction::apply(x); } -TORCH_LIBRARY(test_autograd_cpp_node, m) { +TORCH_LIBRARY(test_multigrad_all_hooks, m) { m.def("custom_op_backed_by_autograd_fn", custom_op_backed_by_autograd_fn); } """ module = load_inline( - name="test_autograd_cpp_node", + name="test_multigrad_all_hooks", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", verbose=True, ) - t4 = 
torch.ops.test_autograd_cpp_node.custom_op_backed_by_autograd_fn(t4) + t4 = torch.ops.test_multigrad_all_hooks.custom_op_backed_by_autograd_fn(t4) res = [None] * 4 count = [0] diff --git a/test/test_custom_ops.py b/test/test_custom_ops.py index cb6ff55f3f471..f0ee8b65be6c5 100644 --- a/test/test_custom_ops.py +++ b/test/test_custom_ops.py @@ -33,6 +33,7 @@ IS_WINDOWS, parametrize, run_tests, + scoped_load_inline, skipIfTorchDynamo, subtest, TestCase, @@ -2088,7 +2089,8 @@ def test_impl_device_invalid(self): with self.assertRaisesRegex(RuntimeError, "Expected one of cpu, cuda"): torch.library.impl("blah::blah", "somethingsomething") - def test_autograd_function_backed_op(self): + @scoped_load_inline + def test_autograd_function_backed_op(self, load_inline): cpp_source = """ struct CustomOpAutogradFunction : public torch::autograd::Function { static constexpr bool is_traceable = true; @@ -2110,13 +2112,13 @@ def test_autograd_function_backed_op(self): return CustomOpAutogradFunction::apply(x); } -TORCH_LIBRARY(mylib, m) { +TORCH_LIBRARY(test_autograd_function_backed_op, m) { m.def("custom_op_backed_by_autograd_fn", custom_op_backed_by_autograd_fn); } """ - module = torch.utils.cpp_extension.load_inline( - name="mylib", + module = load_inline( + name="test_autograd_function_backed_op", cpp_sources=cpp_source, functions="custom_op_backed_by_autograd_fn", verbose=True, @@ -2124,7 +2126,11 @@ def test_autograd_function_backed_op(self): x = torch.ones(2, 2, requires_grad=True) temp = x.clone().detach() - out = torch.ops.mylib.custom_op_backed_by_autograd_fn(x) + out = ( + torch.ops.test_autograd_function_backed_op.custom_op_backed_by_autograd_fn( + x + ) + ) loss = out.sum() loss.backward() self.assertEqual(x.grad, temp) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 6d1f37c950318..764a2fc6f3c01 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -98,6 +98,7 @@ from torch.testing._internal.common_dtype import get_all_dtypes from torch.utils._import_utils import _check_module_exists import torch.utils._pytree as pytree +from torch.utils import cpp_extension try: import pytest has_pytest = True @@ -5379,7 +5380,7 @@ def remove_cpp_extensions_build_root(): """ Removes the default root folder under which extensions are built. 
""" - default_build_root = torch.utils.cpp_extension.get_default_build_root() + default_build_root = cpp_extension.get_default_build_root() if os.path.exists(default_build_root): if IS_WINDOWS: # rmtree returns permission error: [WinError 5] Access is denied @@ -5387,3 +5388,24 @@ def remove_cpp_extensions_build_root(): subprocess.run(["rm", "-rf", default_build_root], stdout=subprocess.PIPE) else: shutil.rmtree(default_build_root, ignore_errors=True) + +# Decorator to provide a helper to load inline extensions to a temp directory +def scoped_load_inline(func): + + @wraps(func) + def wrapper(*args, **kwargs): + def load_inline(*args, **kwargs): + if IS_WINDOWS: + # TODO(xmfan): even using TemporaryDirectoryName will result in permission error + return cpp_extension.load_inline(*args, **kwargs) + + assert "build_directory" not in kwargs + with TemporaryDirectoryName() as temp_dir_name: + if kwargs.get("verbose", False): + print(f'Using temporary extension directory {temp_dir_name}...', file=sys.stderr) + kwargs["build_directory"] = temp_dir_name + return cpp_extension.load_inline(*args, **kwargs) + + return func(*args, load_inline=load_inline, **kwargs) + + return wrapper From a3aca24ae5f2e425b69df29dbc3f7c77ab163dd8 Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Sat, 26 Oct 2024 10:10:14 +0000 Subject: [PATCH 110/161] [AOTI] add C shim for QLinearPointwise (#138439) This PR adds C shim for `QLinearPointwisePT2E` and `QLinearPointwiseBinaryPT2E`. The below changes are needed: - We moved the qlinear API out of the anonymous namespace since we need to call it in the shim layer. - We fixed the code which generated the `inputs` and `constant_args` so that we can directly leverage the `codegen` of the parent class. - `x_scale` and `x_zp` are ensured to be tensor during the lowering stage, thus we can remove the code which handles whether they're tensor or not. 
https://github.com/pytorch/pytorch/blob/fb0da323773e47bc9df5cf3a0628c9a9d7baea73/torch/_inductor/mkldnn_lowerings.py#L492-L496 https://github.com/pytorch/pytorch/blob/fb0da323773e47bc9df5cf3a0628c9a9d7baea73/torch/_inductor/mkldnn_lowerings.py#L499-L503 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138439 Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5, https://github.com/desertfire --- .../src/ATen/native/quantized/cpu/qlinear.cpp | 137 +++---- aten/src/ATen/native/quantized/cpu/qlinear.h | 47 +++ test/inductor/test_cpu_cpp_wrapper.py | 6 +- test/inductor/test_mkldnn_pattern_matcher.py | 6 +- torch/_inductor/mkldnn_ir.py | 339 +++--------------- .../csrc/inductor/aoti_torch/c/shim_mkldnn.h | 40 +++ .../csrc/inductor/aoti_torch/shim_mkldnn.cpp | 94 +++++ 7 files changed, 306 insertions(+), 363 deletions(-) create mode 100644 aten/src/ATen/native/quantized/cpu/qlinear.h diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 26fe9cd2ac4cc..9f2cf186e03b3 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1103,6 +1104,73 @@ static at::Tensor linear_int8_with_onednn_weight( namespace at { namespace native { + + Tensor QLinearOnednn::run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + c10::string_view post_op_name, + torch::List> post_op_args, + c10::string_view post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + static std::optional other = std::nullopt; + static const c10::string_view binary_post_op = "none"; + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, /*other scale*/1.0, /*other zp*/0, + binary_post_op, /*binary alpha*/1.0, + post_op_name, post_op_args, post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + Tensor QLinearOnednn::run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, // extra input for binary post-op + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. 
"none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, other_scale, other_zero_point, + binary_post_op, binary_alpha, + unary_post_op, unary_post_op_args, unary_post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + namespace { template @@ -1220,37 +1288,6 @@ class QLinearOnednn final { TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); } - static Tensor run_pointwise_tensor( - Tensor act, // int8 CPU tensor, not QTensor - Tensor act_scale, - Tensor act_zero_point, - Tensor onednn_weight, // int8 tensor from MkldnnCPU - Tensor weight_scales, - Tensor weight_zero_points, - std::optional bias, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - c10::string_view post_op_name, - torch::List> post_op_args, - c10::string_view post_op_algorithm) { -#if AT_MKLDNN_ENABLED() - TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, - "onednn int8 linear: act scale/zp size should be 1"); - static std::optional other = std::nullopt; - static const c10::string_view binary_post_op = "none"; - return linear_int8_with_onednn_weight( - act, act_scale.item().toDouble(), act_zero_point.item().toLong(), - onednn_weight, weight_scales, weight_zero_points, - bias, output_scale, output_zero_point, output_dtype, - other, /*other scale*/1.0, /*other zp*/0, - binary_post_op, /*binary alpha*/1.0, - post_op_name, post_op_args, post_op_algorithm - ); -#endif - TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); - } - static Tensor run_pointwise_binary( Tensor act, // int8 CPU tensor, not QTensor double act_scale, @@ -1279,40 +1316,6 @@ class QLinearOnednn final { binary_post_op, binary_alpha, unary_post_op, unary_post_op_args, unary_post_op_algorithm ); -#endif - TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); - } - - static Tensor run_pointwise_binary_tensor( - Tensor act, // int8 CPU tensor, not QTensor - Tensor act_scale, - Tensor act_zero_point, - Tensor onednn_weight, // int8 tensor from MkldnnCPU - Tensor weight_scales, - Tensor weight_zero_points, - std::optional other, // extra input for binary post-op - std::optional bias, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - double other_scale, - int64_t other_zero_point, - c10::string_view binary_post_op, // e.g. "none", "sum", "add" - double binary_alpha, - c10::string_view unary_post_op, // e.g. 
"none", "relu" - torch::List> unary_post_op_args, - c10::string_view unary_post_op_algorithm) { -#if AT_MKLDNN_ENABLED() - TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, - "onednn int8 linear: act scale/zp size should be 1"); - return linear_int8_with_onednn_weight( - act, act_scale.item().toDouble(), act_zero_point.item().toLong(), - onednn_weight, weight_scales, weight_zero_points, - bias, output_scale, output_zero_point, output_dtype, - other, other_scale, other_zero_point, - binary_post_op, binary_alpha, - unary_post_op, unary_post_op_args, unary_post_op_algorithm - ); #endif TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); } @@ -1340,11 +1343,11 @@ TORCH_LIBRARY_IMPL(onednn, MkldnnCPU, m) { m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), TORCH_FN(QLinearOnednn::run_pointwise)); m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), - TORCH_FN(QLinearOnednn::run_pointwise_tensor)); + TORCH_FN(at::native::QLinearOnednn::run_pointwise_tensor)); m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), TORCH_FN(QLinearOnednn::run_pointwise_binary)); m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), - TORCH_FN(QLinearOnednn::run_pointwise_binary_tensor)); + TORCH_FN(at::native::QLinearOnednn::run_pointwise_binary_tensor)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.h b/aten/src/ATen/native/quantized/cpu/qlinear.h new file mode 100644 index 0000000000000..bc1db01a741c2 --- /dev/null +++ b/aten/src/ATen/native/quantized/cpu/qlinear.h @@ -0,0 +1,47 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +class QLinearOnednn final { + public: + C10_API static Tensor run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + c10::string_view post_op_name, + torch::List> post_op_args, + c10::string_view post_op_algorithm); + +C10_API static Tensor run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, // extra input for binary post-op + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. 
"none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm); +}; + +} // namespace native +} // namespace at diff --git a/test/inductor/test_cpu_cpp_wrapper.py b/test/inductor/test_cpu_cpp_wrapper.py index ef8c9eebf4e86..a1069678eff2a 100644 --- a/test/inductor/test_cpu_cpp_wrapper.py +++ b/test/inductor/test_cpu_cpp_wrapper.py @@ -285,7 +285,11 @@ class BaseTest(NamedTuple): test_mkldnn_pattern_matcher.TestDynamicPatternMatcher(), condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS, func_inputs=[ - None, + [ + "torch.ops.onednn.qconv2d_pointwise", + "torch.ops.quantized.max_pool2d", + "aoti_torch_cpu__qlinear_pointwise_tensor", + ] ], ), *[ diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index 133379932840e..d0e13ab26c4aa 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -1896,12 +1896,12 @@ def matcher_check_fn(): mod, (v,), [ - "torch.ops.onednn.qlinear_pointwise.tensor", - "torch.ops.onednn.qlinear_pointwise.binary", + "aoti_torch_cpu__qlinear_pointwise_tensor", + "aoti_torch_cpu__qlinear_pointwise_binary_tensor", ], [], check_quantization=True, - num_include_ops=[4, 4], + num_include_ops=[2, 2], ) else: # For python wrapper diff --git a/torch/_inductor/mkldnn_ir.py b/torch/_inductor/mkldnn_ir.py index 948c8ce26cafd..f0aef30437015 100644 --- a/torch/_inductor/mkldnn_ir.py +++ b/torch/_inductor/mkldnn_ir.py @@ -187,6 +187,9 @@ def _prepare_linear_fusion_create( x: "TensorBox", weight: "TensorBox", bias: "TensorBox", + quantize_args: Optional[List["TensorBox"]] = None, + other: Optional["TensorBox"] = None, + binary_sum: bool = False, ): """ This function is a helper function to prepare inputs, layout and constant args @@ -208,7 +211,22 @@ def _prepare_linear_fusion_create( x = cls.require_stride_order(x, req_stride_order) assert x.get_device().type == "cpu" and weight.get_device().type == "cpu" - inputs = [x, weight] + inputs = [x] + + if quantize_args is not None: + x_scale, x_zero_point, w_scale, w_zero_point = quantize_args + x_scale.realize() + x_zero_point.realize() + w_scale.realize() + w_zero_point.realize() + inputs = inputs + [x_scale, x_zero_point] + [weight] + [w_scale, w_zero_point] + else: + inputs += [weight] + + if other is not None: + if binary_sum: + other = cls.require_stride_order(other, req_stride_order) + inputs = inputs + [other] output_stride = FlexibleLayout.contiguous_strides(output_size) kernel_layout = FixedLayout( @@ -223,7 +241,7 @@ def _prepare_linear_fusion_create( inputs.append(bias) else: constant_args.insert(0, bias) - return inputs, constant_args, kernel_layout, req_stride_order + return inputs, constant_args, kernel_layout, req_stride_order, other def _create_output_node(packed): @@ -1253,7 +1271,6 @@ def __init__( inputs, constant_args=(), has_bias=True, - x_scale_zp_are_tensors=False, ) -> None: """ if bias is not None @@ -1266,23 +1283,15 @@ def __init__( fp32_output, unary_attr, unary_scalars, unary_algorithm] """ self.has_bias = has_bias - self.x_scale_zp_are_tensors = x_scale_zp_are_tensors super().__init__( layout, inputs, constant_args, None, - op_overload=( - torch.ops.onednn.qlinear_pointwise.tensor - if x_scale_zp_are_tensors - else torch.ops.onednn.qlinear_pointwise.default - ), - ) - x_scale_type_str, x_zp_type_str = ( - ("at::Tensor", "at::Tensor") - if x_scale_zp_are_tensors - else ("double", "int64_t") + op_overload=(torch.ops.onednn.qlinear_pointwise.tensor), + 
cpp_kernel_name=("aoti_torch_cpu__qlinear_pointwise_tensor"), ) + x_scale_type_str, x_zp_type_str = ("at::Tensor", "at::Tensor") self.cpp_op_schema = f""" at::Tensor( at::Tensor act, @@ -1300,104 +1309,9 @@ def __init__( c10::string_view post_op_algorithm)""" def codegen(self, wrapper): - # Parser the inputs and constant - # The raw_args setup can be skipped if there is a C shim implementation - args = [x.codegen_reference() for x in self.inputs] - const_args = [] - const_args.extend(self.codegen_const_args()) - - x = args[0] - x_raw = self.inputs[0] - packed_weight = args[1] - packed_weight_raw = self.inputs[1] - bias = args[2] if self.has_bias else const_args[0] - bias_raw = self.inputs[2] if self.has_bias else self.constant_args[0] - w_scale, w_zp = args[-2], args[-1] - w_scale_raw, w_zp_raw = self.inputs[-2], self.inputs[-1] - if self.x_scale_zp_are_tensors: - assert len(args) >= 4 - x_scale, x_zp = args[-4], args[-3] - x_scale_raw, x_zp_raw = self.inputs[-4], self.inputs[-3] - ( - o_scale, - o_zp, - output_dtype, - unary_attr, - unary_scalars, - unary_algorithm, - ) = const_args[-6:] - ( - o_scale_raw, - o_zp_raw, - output_dtype_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) = self.constant_args[-6:] - else: - assert len(const_args) >= 8 - ( - x_scale, - x_zp, - o_scale, - o_zp, - output_dtype, - unary_attr, - unary_scalars, - unary_algorithm, - ) = const_args[-8:] - ( - x_scale_raw, - x_zp_raw, - o_scale_raw, - o_zp_raw, - output_dtype_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) = self.constant_args[-8:] + wrapper.include_extra_header("torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h") + super().codegen(wrapper) - codegen_args = ( - x, - x_scale, - x_zp, - packed_weight, - w_scale, - w_zp, - bias, - o_scale, - o_zp, - output_dtype, - unary_attr, - unary_scalars, - unary_algorithm, - ) - raw_args = ( - x_raw, - x_scale_raw, - x_zp_raw, - packed_weight_raw, - w_scale_raw, - w_zp_raw, - bias_raw, - o_scale_raw, - o_zp_raw, - output_dtype_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) - wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( - self.get_name(), - self.python_kernel_name, - self.cpp_kernel_name, - codegen_args, - self.cpp_op_schema, - self.cpp_kernel_key, - self.cpp_kernel_overload_name, - self.op_overload, - raw_args, - ) if isinstance(self.layout, Layout): self.codegen_size_asserts(wrapper) @@ -1405,8 +1319,8 @@ def codegen(self, wrapper): def create( cls, qx: "TensorBox", - x_scale: float, - x_zero_point: int, + x_scale: "TensorBox", + x_zero_point: "TensorBox", qw: "TensorBox", # packed_weight w_scale: "TensorBox", w_zero_point: "TensorBox", @@ -1418,25 +1332,14 @@ def create( post_op_args, post_op_algorithm, ): - (inputs, constant_args, kernel_layout, _) = _prepare_linear_fusion_create( + (inputs, constant_args, kernel_layout, _, _) = _prepare_linear_fusion_create( cls, qx, qw, bias, + [x_scale, x_zero_point, w_scale, w_zero_point], ) - if isinstance(x_scale, TensorBox) and isinstance(x_zero_point, TensorBox): - x_scale.realize() - x_zero_point.realize() - inputs = inputs + [x_scale, x_zero_point] - x_scale_zp_are_tensors = True - else: - assert isinstance(x_scale, float) and isinstance(x_zero_point, int) - constant_args = constant_args + [x_scale, x_zero_point] - x_scale_zp_are_tensors = False - w_scale.realize() - w_zero_point.realize() - inputs = inputs + [w_scale, w_zero_point] constant_args = constant_args + [ output_scale, output_zero_point, @@ -1457,7 +1360,6 @@ def create( 
inputs=inputs, constant_args=constant_args, has_bias=(bias is not None), - x_scale_zp_are_tensors=x_scale_zp_are_tensors, ) @@ -1468,36 +1370,28 @@ def __init__( inputs, constant_args=(), has_bias=True, - x_scale_zp_are_tensors=False, ) -> None: """ if bias is not None - - inputs = [x, w, b, weight_scale, weight_zp, x2] - - const_args is: [x_scale, x_zp, o_scale, o_zp, + - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2, bias] + - const_args is: [o_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] else - - inputs = [x, w, weight_scale, weight_zp, x2] - - const_args is: [bias, x_scale, x_zp, o_scale, o_zp, + - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2] + - const_args is: [bias, o_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] """ self.has_bias = has_bias - self.x_scale_zp_are_tensors = x_scale_zp_are_tensors + self.idx_for_inplace_sum = 6 super().__init__( layout, inputs, constant_args, None, - op_overload=( - torch.ops.onednn.qlinear_pointwise.binary_tensor - if x_scale_zp_are_tensors - else torch.ops.onednn.qlinear_pointwise.binary - ), - ) - x_scale_type_str, x_zp_type_str = ( - ("at::Tensor", "at::Tensor") - if x_scale_zp_are_tensors - else ("double", "int64_t") + op_overload=(torch.ops.onednn.qlinear_pointwise.binary_tensor), + cpp_kernel_name="aoti_torch_cpu__qlinear_pointwise_binary_tensor", ) + x_scale_type_str, x_zp_type_str = ("at::Tensor", "at::Tensor") self.cpp_op_schema = f""" at::Tensor( at::Tensor act, @@ -1520,141 +1414,15 @@ def __init__( c10::string_view unary_post_op_algorithm)""" def codegen(self, wrapper): - # Parser the inputs and constant - # The raw_args setup can be skipped if there is a C shim implementation - args = [x.codegen_reference() for x in self.inputs] - const_args = [] - const_args.extend(self.codegen_const_args()) - - x = args[0] - x_raw = self.inputs[0] - packed_weight = args[1] - packed_weight_raw = self.inputs[1] - bias = args[2] if self.has_bias else const_args[0] - bias_raw = self.inputs[2] if self.has_bias else self.constant_args[0] - w_scale, w_zp, other = args[-3], args[-2], args[-1] - w_scale_raw, w_zp_raw, other_raw = ( - self.inputs[-3], - self.inputs[-2], - self.inputs[-1], - ) - if self.x_scale_zp_are_tensors: - assert len(args) >= 5 - x_scale, x_zp = args[-5], args[-4] - x_scale_raw, x_zp_raw = self.inputs[-5], self.inputs[-4] - ( - o_scale, - o_zp, - output_dtype, - other_scale, - other_zp, - binary_attr, - alpha, - unary_attr, - unary_scalars, - unary_algorithm, - ) = const_args[-10:] - ( - o_scale_raw, - o_zp_raw, - output_dtype_raw, - other_scale_raw, - other_zp_raw, - binary_attr_raw, - alpha_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) = self.constant_args[-10:] - else: - assert len(const_args) >= 8 - ( - x_scale, - x_zp, - o_scale, - o_zp, - output_dtype, - other_scale, - other_zp, - binary_attr, - alpha, - unary_attr, - unary_scalars, - unary_algorithm, - ) = const_args[-12:] - ( - x_scale_raw, - x_zp_raw, - o_scale_raw, - o_zp_raw, - output_dtype_raw, - other_scale_raw, - other_zp_raw, - binary_attr_raw, - alpha_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) = self.constant_args[-12:] - - codegen_args = ( - x, - x_scale, - x_zp, - packed_weight, - w_scale, - w_zp, - other, - bias, - o_scale, - o_zp, - output_dtype, - other_scale, - other_zp, - binary_attr, - alpha, - unary_attr, - unary_scalars, - unary_algorithm, - ) - raw_args = ( - x_raw, - x_scale_raw, - x_zp_raw, - 
packed_weight_raw, - w_scale_raw, - w_zp_raw, - other_raw, - bias_raw, - o_scale_raw, - o_zp_raw, - output_dtype_raw, - other_scale_raw, - other_zp_raw, - binary_attr_raw, - alpha_raw, - unary_attr_raw, - unary_scalars_raw, - unary_algorithm_raw, - ) - wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( - self.get_name(), - self.python_kernel_name, - self.cpp_kernel_name, - codegen_args, - self.cpp_op_schema, - self.cpp_kernel_key, - self.cpp_kernel_overload_name, - self.op_overload, - raw_args, - ) + wrapper.include_extra_header("torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h") + super().codegen(wrapper) if isinstance(self.layout, Layout): self.codegen_size_asserts(wrapper) def get_mutation_names(self): binary_post_op = self.constant_args[-5] if binary_post_op == "sum": - return [self.inputs[-1].get_name()] + return [self.inputs[self.idx_for_inplace_sum].get_name()] else: return [] @@ -1662,8 +1430,8 @@ def get_mutation_names(self): def create( cls, qx: "TensorBox", - x_scale: float, - x_zero_point: int, + x_scale: "TensorBox", + x_zero_point: "TensorBox", qw: "TensorBox", # packed_weight w_scale: "TensorBox", w_zero_point: "TensorBox", @@ -1685,28 +1453,17 @@ def create( constant_args, kernel_layout, req_stride_order, + other, ) = _prepare_linear_fusion_create( cls, qx, qw, bias, + [x_scale, x_zero_point, w_scale, w_zero_point], + other, + binary_post_op == "sum", ) - if isinstance(x_scale, TensorBox) and isinstance(x_zero_point, TensorBox): - x_scale.realize() - x_zero_point.realize() - inputs = inputs + [x_scale, x_zero_point] - x_scale_zp_are_tensors = True - else: - assert isinstance(x_scale, float) and isinstance(x_zero_point, int) - constant_args = constant_args + [x_scale, x_zero_point] - x_scale_zp_are_tensors = False - w_scale.realize() - w_zero_point.realize() - inputs = inputs + [w_scale, w_zero_point] - if binary_post_op == "sum": - other = cls.require_stride_order(other, req_stride_order) - inputs.append(other) constant_args = constant_args + [ output_scale, output_zero_point, @@ -1727,10 +1484,9 @@ def create( inputs=inputs, constant_args=constant_args, has_bias=(bias is not None), - x_scale_zp_are_tensors=x_scale_zp_are_tensors, ) # Return other since it has been inplace changed. 
- return packed.inputs[-1] + return packed.inputs[packed.idx_for_inplace_sum] assert output_dtype is not None if output_dtype in [torch.float32, torch.bfloat16]: @@ -1743,7 +1499,6 @@ def create( inputs=inputs, constant_args=constant_args, has_bias=(bias is not None), - x_scale_zp_are_tensors=x_scale_zp_are_tensors, ) diff --git a/torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h b/torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h index 45e7059d212f8..e379d372ffaa0 100644 --- a/torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h +++ b/torch/csrc/inductor/aoti_torch/c/shim_mkldnn.h @@ -129,6 +129,46 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__linear_pointwise_binary( const char* attr, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__qlinear_pointwise_tensor( + AtenTensorHandle X, + AtenTensorHandle act_scale, + AtenTensorHandle act_zero_point, + AtenTensorHandle onednn_weight, + AtenTensorHandle weight_scales, + AtenTensorHandle weight_zero_points, + AtenTensorHandle* B, + double output_scale, + int64_t output_zero_point, + const int32_t* output_dtype, + const char* post_op_name, + const double** post_op_args, + int64_t post_op_args_len_, + const char* post_op_algorithm, + AtenTensorHandle* ret0); + +AOTI_TORCH_EXPORT AOTITorchError +aoti_torch_cpu__qlinear_pointwise_binary_tensor( + AtenTensorHandle X, + AtenTensorHandle act_scale, + AtenTensorHandle act_zero_point, + AtenTensorHandle onednn_weight, + AtenTensorHandle weight_scales, + AtenTensorHandle weight_zero_points, + AtenTensorHandle* other, + AtenTensorHandle* B, + double output_scale, + int64_t output_zero_point, + const int32_t* output_dtype, + double other_scale, + int64_t other_zero_point, + const char* binary_post_op, + double binary_alpha, + const char* unary_post_op, + const double** unary_post_op_args, + int64_t unary_post_op_args_len_, + const char* unary_post_op_algorithm, + AtenTensorHandle* ret0); + #if AT_MKL_ENABLED() AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mkl_linear( diff --git a/torch/csrc/inductor/aoti_torch/shim_mkldnn.cpp b/torch/csrc/inductor/aoti_torch/shim_mkldnn.cpp index 2ee8af6c6af4f..d8912f95127af 100644 --- a/torch/csrc/inductor/aoti_torch/shim_mkldnn.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_mkldnn.cpp @@ -9,6 +9,7 @@ #endif #include #include +#include using namespace torch::aot_inductor; @@ -269,6 +270,99 @@ AOTITorchError aoti_torch_cpu__linear_pointwise_binary( }); } +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__qlinear_pointwise_tensor( + AtenTensorHandle X, + AtenTensorHandle act_scale, + AtenTensorHandle act_zero_point, + AtenTensorHandle onednn_weight, + AtenTensorHandle weight_scales, + AtenTensorHandle weight_zero_points, + AtenTensorHandle* B, + double output_scale, + int64_t output_zero_point, + const int32_t* output_dtype, + const char* post_op_name, + const double** post_op_args, + int64_t post_op_args_len_, + const char* post_op_algorithm, + AtenTensorHandle* ret0) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + c10::List> scalars_list; + scalars_list.reserve(post_op_args_len_); + for (int64_t i = 0; i < post_op_args_len_; i++) { + scalars_list.emplace_back(pointer_to_optional(post_op_args[i])); + } + + auto tmp_result = at::native::QLinearOnednn::run_pointwise_tensor( + *tensor_handle_to_tensor_pointer(X), + *tensor_handle_to_tensor_pointer(act_scale), + *tensor_handle_to_tensor_pointer(act_zero_point), + *tensor_handle_to_tensor_pointer(onednn_weight), + *tensor_handle_to_tensor_pointer(weight_scales), + 
*tensor_handle_to_tensor_pointer(weight_zero_points), + pointer_to_optional(B), + output_scale, + output_zero_point, + pointer_to_optional(output_dtype), + post_op_name, + scalars_list, + post_op_algorithm); + *ret0 = new_tensor_handle(std::move(tmp_result)); + }); +} + +AOTI_TORCH_EXPORT AOTITorchError +aoti_torch_cpu__qlinear_pointwise_binary_tensor( + AtenTensorHandle X, + AtenTensorHandle act_scale, + AtenTensorHandle act_zero_point, + AtenTensorHandle onednn_weight, + AtenTensorHandle weight_scales, + AtenTensorHandle weight_zero_points, + AtenTensorHandle* other, + AtenTensorHandle* B, + double output_scale, + int64_t output_zero_point, + const int32_t* output_dtype, + double other_scale, + int64_t other_zero_point, + const char* binary_post_op, + double binary_alpha, + const char* unary_post_op, + const double** unary_post_op_args, + int64_t unary_post_op_args_len_, + const char* unary_post_op_algorithm, + AtenTensorHandle* ret0) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + c10::List> scalars_list; + scalars_list.reserve(unary_post_op_args_len_); + for (int64_t i = 0; i < unary_post_op_args_len_; i++) { + scalars_list.emplace_back(pointer_to_optional(unary_post_op_args[i])); + } + + auto tmp_result = at::native::QLinearOnednn::run_pointwise_binary_tensor( + *tensor_handle_to_tensor_pointer(X), + *tensor_handle_to_tensor_pointer(act_scale), + *tensor_handle_to_tensor_pointer(act_zero_point), + *tensor_handle_to_tensor_pointer(onednn_weight), + *tensor_handle_to_tensor_pointer(weight_scales), + *tensor_handle_to_tensor_pointer(weight_zero_points), + pointer_to_optional(other), + pointer_to_optional(B), + output_scale, + output_zero_point, + pointer_to_optional(output_dtype), + other_scale, + other_zero_point, + binary_post_op, + binary_alpha, + unary_post_op, + scalars_list, + unary_post_op_algorithm); + *ret0 = new_tensor_handle(std::move(tmp_result)); + }); +} + #if AT_MKL_ENABLED() AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu__mkl_linear( From 043864afdfd2f700cb326c5d30afc84a65cbd32f Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sat, 26 Oct 2024 12:48:51 +0000 Subject: [PATCH 111/161] enable test_x86inductor_quantizer.py UTs on Windows. (#138937) This UTs are failed months ago, but due to the main branch move forward, some PRs fixed it. Let's turn on them. 
Local test passed: image Pull Request resolved: https://github.com/pytorch/pytorch/pull/138937 Approved by: https://github.com/jansel --- test/quantization/pt2e/test_x86inductor_quantizer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/quantization/pt2e/test_x86inductor_quantizer.py b/test/quantization/pt2e/test_x86inductor_quantizer.py index 6958b0e277359..8e3b5fa1cb44d 100644 --- a/test/quantization/pt2e/test_x86inductor_quantizer.py +++ b/test/quantization/pt2e/test_x86inductor_quantizer.py @@ -1,7 +1,6 @@ # Owner(s): ["oncall: quantization"] import copy import itertools -import sys from enum import Enum import torch @@ -25,12 +24,7 @@ skipIfNoX86, ) from torch.testing._internal.common_quantized import override_quantized_engine -from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, skipIfTorchDynamo - - -if IS_WINDOWS and IS_CI: - sys.stderr.write("Windows CI still has some issue to be fixed.\n") - sys.exit(0) +from torch.testing._internal.common_utils import skipIfTorchDynamo class NodePosType(Enum): From 3234b251b30b49b60910a09ee797ef09282f6dd5 Mon Sep 17 00:00:00 2001 From: Adnan Akhundov Date: Thu, 24 Oct 2024 18:12:07 -0700 Subject: [PATCH 112/161] Fix typos in CreateTMADescriptorVariable (#138877) This fixes some leftover typos in CreateTMADescriptorVariable.call_function (and close). Pull Request resolved: https://github.com/pytorch/pytorch/pull/138877 Approved by: https://github.com/davidberard98, https://github.com/zou3519, https://github.com/Skylion007 ghstack dependencies: #138759 --- torch/_dynamo/variables/functions.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py index 188a6fe5cb8eb..a178b0a5956c0 100644 --- a/torch/_dynamo/variables/functions.py +++ b/torch/_dynamo/variables/functions.py @@ -1227,8 +1227,7 @@ def __init__( **kwargs, ): assert isinstance(data_ptr, variables.DataPtrVariable) - - super().__init__(**kwargs), + super().__init__(**kwargs) self.data_ptr = data_ptr self.dims = dims self.block_dims = block_dims @@ -1260,8 +1259,8 @@ def __init__( rank: int, **kwargs, ) -> None: - super().__init__(**kwargs), assert rank in (1, 2) + super().__init__(**kwargs) self.rank = rank def call_function( @@ -1295,9 +1294,9 @@ def call_function( ] block_dims = [ kwargs["block_dim1"] if "block_dim1" in kwargs else args[3], - kwargs["block_dim2"] if "block_dim2" in kwargs else args[4], + kwargs["block_dim0"] if "block_dim0" in kwargs else args[4], ] - element_size = kwargs["ptr"] if "ptr" in kwargs else args[-1] + element_size = kwargs["element_size"] if "element_size" in kwargs else args[-1] return TMADescriptorVariable( data_ptr=ptr, From 14a17ad630153ccbfce2903adba38f3479639bf6 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Thu, 24 Oct 2024 16:59:11 -0400 Subject: [PATCH 113/161] Elide calls to is_nested in Dynamo-traced graphs (#138841) Before this PR, calling `is_nested` in-graph would result in graph code like the following: ```python class GraphModule(torch.nn.Module): def forward(self, L_nt_: "f64[3, s1, 5]", s1: "Sym(s1)"): l_nt_ = L_nt_ # Note this useless line! getattr_1 = l_nt_.is_nested; getattr_1 = None add: "f64[3, s1, 5]" = l_nt_ + 2; l_nt_ = None return (add,) ``` This PR follows what is done for `is_sparse` / `is_quantized`: store it onto `TensorVariable` and have `getattr` calls to `is_nested` return the stored value as a constant. This removes the useless line above from the graph. 
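For illustration, a minimal sketch of the kind of user code this change affects, mirroring the new `test_in_graph_is_nested_call` added below; the jagged `nested_tensor` construction here is just one convenient way to obtain a nested tensor input and is not taken from this PR:

```python
import torch

def f(t):
    # `t.is_nested` is now answered from the attribute stored on TensorVariable,
    # so the branch is resolved at trace time and no getattr call lands in the graph.
    return t + 2 if t.is_nested else t + 1

nt = torch.nested.nested_tensor(
    [torch.randn(2, 5), torch.randn(3, 5)], layout=torch.jagged
)
out = torch.compile(f, backend="aot_eager", fullgraph=True)(nt)
```

With the attribute folded to a constant, the captured graph contains only the arithmetic, as asserted by the expected graph in the new test.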
Note that guarding is handled through tensor type check guards, so no need to guard on `is_nested` status. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138841 Approved by: https://github.com/soulitzer --- test/dynamo/test_repros.py | 5 +---- test/dynamo/test_subclasses.py | 32 ++++++++++++++++++++++++++++++- torch/_dynamo/variables/tensor.py | 8 ++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index fc922b39fb11b..6d488c74fe291 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -1700,10 +1700,7 @@ def test_issue175(self): opt_model(inp) opt_model(inp) self.assertEqual(cnt.frame_count, 1) - - self.assertEqual( - 15 if torch._dynamo.config.inline_inbuilt_nn_modules else 12, cnt.op_count - ) + self.assertEqual(12, cnt.op_count) def test_exec_import(self): def fn1(): diff --git a/test/dynamo/test_subclasses.py b/test/dynamo/test_subclasses.py index 37f5d01feca18..db3706f90eeab 100644 --- a/test/dynamo/test_subclasses.py +++ b/test/dynamo/test_subclasses.py @@ -10,7 +10,7 @@ import torch._functorch.config import torch.utils._pytree as pytree import torch.utils.checkpoint -from torch._dynamo.testing import normalize_gm +from torch._dynamo.testing import CompileCounterWithBackend, normalize_gm from torch._higher_order_ops.wrap import wrap from torch.fx.experimental.symbolic_shapes import ( DimDynamic, @@ -1988,6 +1988,36 @@ def append_guard_fail(guards): return guards_exported, guards_failed + def test_in_graph_is_nested_call(self): + def f(nt): + if nt.is_nested: + return nt + 2 + else: + return nt + 1 + + cnt = CompileCounterWithBackend("aot_eager") + compiled_f = torch.compile(f, backend=cnt, fullgraph=True) + nt, offsets = self._get_jagged_tensor(((2, 3, 4), 5), None) + output = compiled_f(nt) + output.backward(torch.ones_like(output)) + self.assertEqual(cnt.frame_count, 1) + self.assertEqual(len(cnt.graphs), 1) + graph = cnt.graphs[0] + norm_graph = normalize_gm(graph.print_readable(print_output=False)) + + # expect -no- is_nested calls within the graph + self.assertExpectedInline( + norm_graph, + """\ +class GraphModule(torch.nn.Module): + def forward(self, L_nt_: "f64[3, s1, 5]", s1: "Sym(s1)"): + l_nt_ = L_nt_ + + add: "f64[3, s1, 5]" = l_nt_ + 2; l_nt_ = None + return (add,) +""", # noqa: B950 + ) + # Note: [What kind of guards are involved in nested tensor compilation] # # Until we implement UnionFind, dynamic shapes guards are not involved. 
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index bb64f30a458ac..8c55e7d5a6cc6 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -103,6 +103,7 @@ class TensorVariable(VariableTracker): "requires_grad", "is_quantized", "is_contiguous", + "is_nested", "is_sparse", "class_type", "specialized_value", @@ -128,6 +129,7 @@ def __init__( layout, ndim, requires_grad, + is_nested, is_quantized, is_sparse, class_type, @@ -149,6 +151,7 @@ def __init__( self.requires_grad = requires_grad self.is_quantized = is_quantized self.is_contiguous = is_contiguous + self.is_nested = is_nested self.is_sparse = is_sparse self.class_type = class_type self.has_grad_fn = has_grad_fn @@ -175,6 +178,7 @@ def specialize(value: torch.Tensor): "layout": value.layout, "ndim": int(value.ndim), "requires_grad": value.requires_grad, + "is_nested": value.is_nested, "is_quantized": value.is_quantized, "is_sparse": value.is_sparse, "class_type": type(value), @@ -320,6 +324,10 @@ def method_attr_is_sparse(self, tx): if self.is_sparse is not None: return ConstantVariable.create(self.is_sparse) + def method_attr_is_nested(self, tx): + if self.is_nested is not None: + return ConstantVariable.create(self.is_nested) + def method_attr_data(self, tx): return variables.TorchInGraphFunctionVariable( torch._C._autograd._get_data_attr From f1a677cba5ef7514f2cf303753d3117528867a33 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 25 Oct 2024 12:23:24 -0700 Subject: [PATCH 114/161] In Inductor, be willing to generate deferred runtime asserts when unbacked (#138804) Title + we avoid calling defer_assert when we statically know the guard results. timing for pnasnet5large ``` TIMING: code_gen:21.79672 inductor_compile:39.57726 backend_compile:65.30649 entire_frame_compile:95.22052 total_wall_time:95.22052 ``` matches with out the diff ``` TIMING: code_gen:21.89314 inductor_compile:39.72298 backend_compile:65.38539 entire_frame_compile:95.0854 total_wall_time:95.0854 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138804 Approved by: https://github.com/ezyang --- test/inductor/test_aot_inductor.py | 90 ++++++++++++++++++++++++ torch/_inductor/sizevars.py | 27 +++++-- torch/fx/experimental/symbolic_shapes.py | 7 +- 3 files changed, 115 insertions(+), 9 deletions(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 14c48a4e7e94e..3f346a2098140 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -15,6 +15,7 @@ import torch._inductor import torch._inductor.config import torch.nn as nn +import torch.nn.functional as F from torch._dynamo import config as dynamo_config from torch._dynamo.testing import rand_strided, same from torch._dynamo.utils import counters @@ -3648,6 +3649,95 @@ def forward(self, x): example_inputs = (torch.randn(8, device=self.device),) self.check_model(Model(), example_inputs) + def test_tile_positional_embedding(self): + class TilePositionalEmbedding(nn.Module): + """ + Positional embedding for tiles, different for every tile, same for every token within a tile. + Notice that tile is different from patch (token). For details, please check the documentation of + :class:`torchtune.modules.vision_transformer.VisionTransformer`. + Args: + max_num_tiles (int): The maximum number of tiles an image can be divided into. + embed_dim (int): The dimensionality of each tile embedding. 
+ """ + + def __init__( + self, + max_num_tiles: int, + embed_dim: int, + ): + super().__init__() + self.max_num_tiles = max_num_tiles + self.embed_dim = embed_dim + + scale = embed_dim**-0.5 + self.embedding = nn.Parameter( + scale * torch.randn(max_num_tiles, max_num_tiles, 1, embed_dim) + ) + self.gate = nn.Parameter(torch.zeros(1)) + + def forward( + self, x: torch.Tensor, aspect_ratio: torch.Tensor + ) -> torch.Tensor: + """ + args: + x (torch.Tensor): torch.Tensor with shape (bsz * n_imgs, n_tiles, n_tokens, embed_dim). + aspect_ratio (torch.Tensor): torch.Tensor with shape (bsz * n_imgs, 2), + representing the aspect ratio of the image before tile-cropping, e.g. (2,1). + returns: + torch.Tensor: The input tensor with added positional embeddings. + """ + bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape + torch._check(n_tiles <= self.max_num_tiles) + + for batch_idx, (n_tiles_h, n_tiles_w) in enumerate(aspect_ratio): + # When we batch images, all are padded to the same amount of tiles. + # The aspect_ratio lets us know the non padded tiles for each image. + # We only add positional encoding to those. + n_tiles_h = n_tiles_h.item() + n_tiles_w = n_tiles_w.item() + + n_non_padded_tiles = int(n_tiles_h * n_tiles_w) + + # We get only the positional encoding for non padded tiles, + # i.e. n_tiles_h, n_tiles_w. + torch._check_is_size(n_tiles_h) + torch._check_is_size(n_tiles_w) + torch._check(n_tiles_h > 0) + torch._check(n_tiles_w > 0) + torch._check(n_tiles_h <= self.max_num_tiles) + torch._check(n_tiles_w <= self.max_num_tiles) + padded_embedding = F.pad(self.embedding, (0, 0, 0, 0, 0, 1, 0, 1)) + # pos_embed = padded_embedding[:n_tiles_h, :n_tiles_w, :, :] + pos_embed = padded_embedding.narrow(0, 0, n_tiles_h).narrow( + 1, 0, n_tiles_w + ) + + # Add pos encoding to the non padded tiles. + pos_embed = pos_embed.clone() + pos_embed = pos_embed.view(n_non_padded_tiles, 1, self.embed_dim) + + x = F.pad(x, (0, 0, 0, 0, 0, 1, 0, 0)) + torch._check_is_size(n_non_padded_tiles) + torch._check(n_non_padded_tiles < x.size(1)) + # x[batch_idx, :n_non_padded_tiles, :, :] += pos_embed + updating = x.narrow(0, batch_idx, batch_idx + 1).narrow( + 1, 0, n_non_padded_tiles + ) + # updating += pos_embed * self.gate.tanh() + updating.add_(pos_embed * self.gate.tanh()) + # x = x[:, :n_tiles, :, :] + x = x.narrow(1, 0, n_tiles) + + return x + + x = torch.ones(1, 4, 1600, 1280, device=self.device) + aspect_ratio = torch.tensor([[2, 2]], device=self.device) + + self.check_model( + TilePositionalEmbedding(4, 1280), + (x, aspect_ratio), + ) + @dynamo_config.patch({"capture_scalar_outputs": True}) def test_sym_i64_input_codegen(self): if self.device != "cuda": diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index 44fe34895a8cd..8775036cf1059 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -247,9 +247,11 @@ def _simplify_loops_impl( # for which "strides" don't make sense so we ignore them here. # NOTE: These expressions may still block merging dims in the sound # substitution test performed in can_merge_dims. 
- self.stride_vars(x, index_vars) - if isinstance(x, sympy.Expr) - else [0] * len(index_vars) + ( + self.stride_vars(x, index_vars) + if isinstance(x, sympy.Expr) + else [0] * len(index_vars) + ) for x in index_formulas ] assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0])) @@ -415,14 +417,29 @@ def guard_equals(self, left: Expr, right: Expr) -> Expr: left = sympy_subs(left, self.inv_precomputed_replacements) # type: ignore[arg-type] if isinstance(right, Expr): right = sympy_subs(right, self.inv_precomputed_replacements) # type: ignore[arg-type] - assert self.shape_env.evaluate_expr(sympy.Eq(left, right)) + + expr = sympy.Eq(left, right) + static_expr = self.shape_env._maybe_evaluate_static(expr) + + if static_expr is not None: + assert bool(static_expr) + return left + + assert self.shape_env.defer_runtime_assert(expr, "guard_equals") return left def guard_leq(self, left: Expr, right: Expr) -> None: return self.guard_lt(left, right + 1) def guard_lt(self, left: Expr, right: Expr) -> None: - assert self.shape_env.evaluate_expr(sympy.Lt(left, right)) + expr = sympy.Lt(left, right) + static_expr = self.shape_env._maybe_evaluate_static(expr) + + if static_expr is not None: + assert bool(static_expr) + return + + assert self.shape_env.defer_runtime_assert(expr, "guard_lt") def guarded_order(self, seq): """ diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index d5503ba25acb3..83c651e29c585 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -6289,6 +6289,7 @@ def cleanup(self) -> None: for ra in ras: ra.stack.cleanup() + @lru_cache(256) @record_shapeenv_event(save_tracked_fakes=True) def defer_runtime_assert( self, orig_expr: SympyBoolean, msg: str, fx_node: Optional[torch.fx.Node] = None @@ -6326,7 +6327,6 @@ def defer_runtime_assert( # NB: Don't use new_expr as expr; it could contain gunk like shape0 # which we don't want to guard on - # OK, we're definitely doing a runtime assert now if ( self._translation_validation_enabled and fx_node is not None @@ -6340,10 +6340,9 @@ def defer_runtime_assert( if not self._suppress_guards_tls(): # If you're here because of this assert, read Note [Backwards runtime asserts] # in torch/_inductor/graph.py - assert not self.runtime_asserts_frozen, expr - + if self.runtime_asserts_frozen: + log.warning("runtime_asserts_frozen but then got %s", expr) self._check_frozen(expr, sympy.true) - # eliminate symbols on equality tests / refine ranges if isinstance(expr, sympy.Rel): self._maybe_guard_rel(expr) From eb6c7b93a731f35948e90c744c50cffdc81b7992 Mon Sep 17 00:00:00 2001 From: James Wu Date: Fri, 25 Oct 2024 11:20:01 -0700 Subject: [PATCH 115/161] Log AOTAutogradCache state to PT2 Compile Events (#138604) Same as previous diff for inductor, but for autograd instead Differential Revision: [D64765199](https://our.internmc.facebook.com/intern/diff/D64765199/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138604 Approved by: https://github.com/oulgen --- .../_aot_autograd/autograd_cache.py | 24 +++++++++++++++++++ torch/_functorch/aot_autograd.py | 19 ++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/torch/_functorch/_aot_autograd/autograd_cache.py b/torch/_functorch/_aot_autograd/autograd_cache.py index 9512e6561a438..e5bc8bc95d3c6 100644 --- a/torch/_functorch/_aot_autograd/autograd_cache.py +++ b/torch/_functorch/_aot_autograd/autograd_cache.py @@ -33,6 +33,7 @@ from torch._inductor.runtime.runtime_utils 
import cache_dir from torch._inductor.utils import should_use_remote_fx_graph_cache from torch._logging import LazyString +from torch._utils_internal import log_cache_bypass from .runtime_wrappers import ( AOTDispatchAutograd, @@ -439,11 +440,14 @@ def wrap_post_compile( compiled_fw_func = self.compiled_fw.load(args, fx_config) compiled_bw_func = None + chromium_log = get_chromium_event_logger() if self.compiled_bw is not None: compiled_bw_func = self.compiled_bw.load(args, fx_config) needs_autograd = True + chromium_log.add_event_data("backend_compile", dispatch_mode="autograd") else: needs_autograd = False + chromium_log.add_event_data("backend_compile", dispatch_mode="inference") # Wrap the forward function in post compile wrappers compiled_fw_func = AOTDispatchSubclassWrapper( @@ -455,6 +459,11 @@ def wrap_post_compile( compiled_fw_func, aot_config, runtime_metadata=self.runtime_metadata ) + req_subclass_dispatch = self.maybe_subclass_meta is not None + chromium_log.add_event_data( + "backend_compile", requires_subclass_dispatch=req_subclass_dispatch + ) + # In autograd case, functionalizedRngWrapper should not modify outs return_new_outs = not needs_autograd compiled_fw_func = FunctionalizedRngRuntimeWrapper( @@ -619,6 +628,9 @@ def load( counters["aot_autograd"]["autograd_cache_bypass"] += 1 cache_state = "bypass" cache_event_time = time.time_ns() + cache_info["cache_bypass_reason"] = str(e) + if remote: + log_cache_bypass("bypass_aot_autograd", str(e)) if config.strict_autograd_cache: raise e if compiled_fn is None: @@ -638,6 +650,18 @@ def load( chromium_log.log_instant_event( f"autograd_cache_{cache_state}", cache_event_time, metadata=cache_info ) + + chromium_log.add_event_data( + "backend_compile", + cache_state=cache_state, + cache_event_time=cache_event_time, + key=cache_info.get("key"), + components=cache_info.get("components"), + cache_bypass_reason=cache_info.get("cache_bypass_reason"), + remote_cache_enabled=remote, + local_cache_enabled=local, + ) + torch._logging.trace_structured( "artifact", metadata_fn=lambda: { diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index add969035f0bc..85120254b2e66 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -15,7 +15,11 @@ from torch._decomp.decompositions_for_rng import PhiloxStateTracker, rng_decompositions from torch._dispatch.python import enable_python_dispatcher from torch._dynamo import compiled_autograd -from torch._dynamo.utils import dynamo_timed, preserve_rng_state +from torch._dynamo.utils import ( + dynamo_timed, + get_chromium_event_logger, + preserve_rng_state, +) from torch._guards import detect_fake_mode from torch._inductor.utils import BoxedBool from torch._subclasses import FakeTensor, FakeTensorMode @@ -581,6 +585,13 @@ def _create_aot_dispatcher_function( enable_python_dispatcher() if shape_env is not None else nullcontext() ) + def try_record_chromium_data(**kwargs): + # `backend_compile` only exists as an event if we are compiling with dynamo + # In some unit tests we don't use dynamo, so we ignore those cases + chromium_log = get_chromium_event_logger() + if "backend_compile" in chromium_log.get_stack(): + chromium_log.add_event_data("backend_compile", **kwargs) + # See NOTE: [Deferring tensor pack/unpack hooks until runtime] # If any saved tensor hooks are active, we **don't** want to trace them. 
# Instead, we'll let them run at runtime, around the custom autograd.Function @@ -634,6 +645,9 @@ def _dup_fake_script_obj(fake_flat_args): req_subclass_dispatch = requires_subclass_dispatch( fake_flat_args, fw_metadata ) + try_record_chromium_data( + requires_subclass_dispatch=req_subclass_dispatch + ) output_and_mutation_safe = not any( x.requires_grad @@ -752,10 +766,13 @@ def choose_dispatcher(needs_autograd, aot_config): if aot_config.is_export: # export uses just the "graph bits", whereas the other # two dispatchers include some extra work around handling a runtime epilogue + try_record_chromium_data(dispatch_mode="export") return partial(aot_dispatch_export, needs_autograd=needs_autograd) elif needs_autograd and not aot_config.pre_dispatch: + try_record_chromium_data(dispatch_mode="autograd") return aot_dispatch_autograd else: + try_record_chromium_data(dispatch_mode="inference") return aot_dispatch_base compiler_fn = choose_dispatcher(needs_autograd, aot_config) From 49ed365b226f8aa5f6bf94a1e3ced39ccefbc062 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sat, 26 Oct 2024 15:07:10 +0000 Subject: [PATCH 116/161] [BE]: Update Typeguard to TypeIs for better type inference (#133814) Uses TypeIs instead of TypeGuard for better inference. See https://peps.python.org/pep-0742/ Pull Request resolved: https://github.com/pytorch/pytorch/pull/133814 Approved by: https://github.com/ezyang --- .ci/docker/requirements-ci.txt | 2 +- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- torch/__init__.py | 6 +++--- torch/_dynamo/utils.py | 6 +++--- torch/_inductor/pattern_matcher.py | 8 ++++---- torch/_subclasses/fake_tensor.py | 6 +++--- torch/masked/maskedtensor/core.py | 4 ++-- torch/nn/parameter.pyi | 4 ++-- torch/serialization.py | 4 ++-- torch/utils/_python_dispatch.py | 6 +++--- 12 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index d25a290f654aa..f530c42d09f6d 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -257,7 +257,7 @@ tb-nightly==2.13.0a20230426 #test that import: # needed by torchgen utils -typing-extensions +typing-extensions>=4.10.0 #Description: type hints for python #Pinned versions: #test that import: diff --git a/pyproject.toml b/pyproject.toml index 4f1cf077a7e0c..c15594e54a737 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "ninja", "pyyaml", "cmake", - "typing-extensions", + "typing-extensions>=4.10.0", "requests", ] # Use legacy backend to import local packages in setup.py diff --git a/requirements.txt b/requirements.txt index f22947eb2eb70..6ce86e87d8927 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ requests # is required until pytorch build not refactored to work for latest setuptools. 
setuptools<=72.1.0 types-dataclasses -typing-extensions>=4.8.0 +typing-extensions>=4.10.0 sympy==1.13.1 ; python_version >= "3.9" filelock networkx diff --git a/setup.py b/setup.py index 3aec923516133..576b635c8a260 100644 --- a/setup.py +++ b/setup.py @@ -1159,7 +1159,7 @@ def main(): ) install_requires = [ "filelock", - "typing-extensions>=4.8.0", + "typing-extensions>=4.10.0", 'setuptools ; python_version >= "3.12"', 'sympy==1.13.1 ; python_version >= "3.9"', "networkx", diff --git a/torch/__init__.py b/torch/__init__.py index a4073d43ff89c..5ff3c610abff6 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -34,7 +34,7 @@ TypeVar as _TypeVar, Union as _Union, ) -from typing_extensions import ParamSpec as _ParamSpec, TypeGuard as _TypeGuard +from typing_extensions import ParamSpec as _ParamSpec, TypeIs as _TypeIs if TYPE_CHECKING: @@ -1033,7 +1033,7 @@ def typename(obj: _Any, /) -> str: return f"{module}.{qualname}" -def is_tensor(obj: _Any, /) -> _TypeGuard["torch.Tensor"]: +def is_tensor(obj: _Any, /) -> _TypeIs["torch.Tensor"]: r"""Returns True if `obj` is a PyTorch tensor. Note that this function is simply doing ``isinstance(obj, Tensor)``. @@ -1053,7 +1053,7 @@ def is_tensor(obj: _Any, /) -> _TypeGuard["torch.Tensor"]: return isinstance(obj, torch.Tensor) -def is_storage(obj: _Any, /) -> _TypeGuard[_Union["TypedStorage", "UntypedStorage"]]: +def is_storage(obj: _Any, /) -> _TypeIs[_Union["TypedStorage", "UntypedStorage"]]: r"""Returns True if `obj` is a PyTorch storage object. Args: diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 696cf371b4233..775ec8b488dd9 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -56,7 +56,7 @@ Union, ValuesView, ) -from typing_extensions import Literal, TypeGuard +from typing_extensions import Literal, TypeIs import torch import torch._functorch.config @@ -582,14 +582,14 @@ def clear(self): @overload -def istype(obj: object, allowed_types: Type[T]) -> TypeGuard[T]: +def istype(obj: object, allowed_types: Type[T]) -> TypeIs[T]: ... @overload def istype( obj: object, allowed_types: Tuple[Type[List[T]], Type[Tuple[T, ...]]] -) -> TypeGuard[T]: +) -> TypeIs[T]: ... diff --git a/torch/_inductor/pattern_matcher.py b/torch/_inductor/pattern_matcher.py index 061ddcb7c6c83..a45a7505b6cd1 100644 --- a/torch/_inductor/pattern_matcher.py +++ b/torch/_inductor/pattern_matcher.py @@ -70,7 +70,7 @@ TypeVar, Union, ) -from typing_extensions import Self, TypeGuard +from typing_extensions import Self, TypeIs import torch import torch._guards @@ -305,10 +305,10 @@ def __bool__(self) -> bool: MatchResult = Union[Match, FailedMatch] -def is_match(m: MatchResult) -> TypeGuard[Match]: +def is_match(m: MatchResult) -> TypeIs[Match]: """ - TypeGuards cannot act on `self`. Thus this function exists to let mypy - recognize FailedMatch.__bool__ as a TypeGuard. + TypeIs cannot act on `self`. Thus this function exists to let mypy + recognize FailedMatch.__bool__ as a TypeIs. 
""" return bool(m) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index df2fef74e127e..565b3a2fc66ae 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -32,7 +32,7 @@ TypeVar, Union, ) -from typing_extensions import Self, TypeGuard +from typing_extensions import Self, TypeIs from weakref import ReferenceType import torch @@ -170,7 +170,7 @@ def get_plain_tensors( return plain_tensors -def is_fake(x: object) -> TypeGuard[Tensor]: +def is_fake(x: object) -> TypeIs[Tensor]: if isinstance(x, FakeTensor): return True if is_traceable_wrapper_subclass(x): @@ -1214,7 +1214,7 @@ def reset_nt_tensor_id_counter(self) -> None: # In this case, it's insufficient to test only one FakeTensor: you need # to distinguish between our fake tensor and other fake tensors. That's # what this function does. - def is_our_fake(self, t: object) -> TypeGuard[FakeTensor]: + def is_our_fake(self, t: object) -> TypeIs[FakeTensor]: return isinstance(t, FakeTensor) and t.fake_mode is self # If we should avoid device init. This changes the behavior of various APIs: diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py index 22f98b7a31824..d1cc620325933 100644 --- a/torch/masked/maskedtensor/core.py +++ b/torch/masked/maskedtensor/core.py @@ -3,7 +3,7 @@ import warnings from typing import Any -from typing_extensions import TypeGuard +from typing_extensions import TypeIs import torch from torch.overrides import get_default_nowrap_functions @@ -15,7 +15,7 @@ ] -def is_masked_tensor(obj: Any, /) -> TypeGuard["MaskedTensor"]: +def is_masked_tensor(obj: Any, /) -> TypeIs["MaskedTensor"]: r"""Returns True if the input is a MaskedTensor, else False Args: diff --git a/torch/nn/parameter.pyi b/torch/nn/parameter.pyi index 9c998fb07f2c1..6b5afa860b863 100644 --- a/torch/nn/parameter.pyi +++ b/torch/nn/parameter.pyi @@ -1,5 +1,5 @@ # mypy: allow-untyped-defs -from typing_extensions import TypeGuard +from typing_extensions import TypeIs from torch import device, dtype, Tensor @@ -8,7 +8,7 @@ class Parameter(Tensor): def is_lazy( param: Tensor, -) -> TypeGuard[UninitializedParameter | UninitializedBuffer]: ... +) -> TypeIs[UninitializedParameter | UninitializedBuffer]: ... class UninitializedParameter(Tensor): def __init__(self, data: Tensor = ..., requires_grad: bool = ...) -> None: ... 
diff --git a/torch/serialization.py b/torch/serialization.py index 20ef32f2c82b2..17517db6e7fd1 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -28,7 +28,7 @@ Type, Union, ) -from typing_extensions import TypeAlias, TypeGuard # Python 3.10+ +from typing_extensions import TypeAlias, TypeIs import torch import torch._weights_only_unpickler as _weights_only_unpickler @@ -620,7 +620,7 @@ def storage_to_tensor_type(storage): return getattr(module, storage_type.__name__.replace("Storage", "Tensor")) -def _is_path(name_or_buffer) -> TypeGuard[Union[str, os.PathLike]]: +def _is_path(name_or_buffer) -> TypeIs[Union[str, os.PathLike]]: return isinstance(name_or_buffer, (str, os.PathLike)) diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py index bf0853f1fe491..04604bc6ec59e 100644 --- a/torch/utils/_python_dispatch.py +++ b/torch/utils/_python_dispatch.py @@ -4,7 +4,7 @@ import warnings from dataclasses import dataclass from typing import Any, Dict, List, Optional, Set, Union, Protocol, Tuple, Sequence, overload, Deque, Type -from typing_extensions import TypeGuard +from typing_extensions import TypeIs from collections import deque import torch @@ -365,7 +365,7 @@ def to( -def is_traceable_wrapper_subclass(t: object) -> TypeGuard[TensorWithFlatten]: +def is_traceable_wrapper_subclass(t: object) -> TypeIs[TensorWithFlatten]: """ Returns whether or not a tensor subclass that implements __torch_dispatch__ is 'traceable' with torch.compile. @@ -402,7 +402,7 @@ def is_traceable_wrapper_subclass(t: object) -> TypeGuard[TensorWithFlatten]: and hasattr(t, "__tensor_unflatten__") ) -def is_traceable_wrapper_subclass_type(t: Type) -> TypeGuard[Type[TensorWithFlatten]]: +def is_traceable_wrapper_subclass_type(t: Type) -> TypeIs[Type[TensorWithFlatten]]: """Same as above, but takes a type argument instead of an instance.""" return (issubclass(t, torch.Tensor) and t != torch.Tensor and hasattr(t, "__tensor_flatten__") and hasattr(t, "__tensor_unflatten__")) From dba6887dc6df29a841c126c5f5634787ac625433 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 25 Oct 2024 12:14:35 -0700 Subject: [PATCH 117/161] [dynamo][refactor][config-cleanp] Use guard_manager consistently instead of check_fn (#138896) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138896 Approved by: https://github.com/williamwen42, https://github.com/jansel ghstack dependencies: #138512 --- test/dynamo/test_frame_init.py | 2 +- torch/_dynamo/cache_size.py | 10 +-- torch/_dynamo/convert_frame.py | 2 +- torch/_dynamo/guards.py | 107 ++++++++++++++---------------- torch/_dynamo/testing.py | 2 +- torch/_dynamo/types.py | 6 +- torch/csrc/dynamo/cache_entry.cpp | 12 ++-- torch/csrc/dynamo/cache_entry.h | 9 +-- torch/csrc/dynamo/extra_state.cpp | 10 +-- torch/csrc/dynamo/init.cpp | 2 +- 10 files changed, 78 insertions(+), 84 deletions(-) diff --git a/test/dynamo/test_frame_init.py b/test/dynamo/test_frame_init.py index 00206d52e3936..97aac1870e984 100644 --- a/test/dynamo/test_frame_init.py +++ b/test/dynamo/test_frame_init.py @@ -87,7 +87,7 @@ def test_frame_init(self): target_with_varkwargs.__code__: varkwargs_code2.__code__, } - empty_guard_manager = torch._dynamo.guards.GuardManager() + empty_guard_manager = torch._dynamo.guards.GuardManagerWrapper() def callback1(frame, cache_entry, frame_state): if frame.f_code in code_map1: diff --git a/torch/_dynamo/cache_size.py b/torch/_dynamo/cache_size.py index 5c675ad052907..1d0c169345d2e 100644 --- a/torch/_dynamo/cache_size.py +++ 
b/torch/_dynamo/cache_size.py @@ -15,10 +15,10 @@ [Note on cache size limit] Background - TorchDynamo cache is a linked list. Each cache entry is a -(check_fn, out_code, next pointer). These are stored on the f_code's co_extra +(guard_manager, out_code, next pointer). These are stored on the f_code's co_extra scratch space. When a frame is invoked, we walk this linked list and run -check_fn in each cache_entry to decide if the frame needs recompilation. If none -of the check_fn's returns True, we recompile and add a new entry. To ensure we +guard_manager in each cache_entry to decide if the frame needs recompilation. If none +of the guard_manager's returns True, we recompile and add a new entry. To ensure we don't end up recompiling infinitely, we put limits on the cache size. There are two limits @@ -121,7 +121,7 @@ def _has_same_id_matched_objs(frame: types.FrameType, cache_entry) -> bool: for ( local_name, weakref_from_cache_entry, - ) in cache_entry.check_fn.id_matched_objs.items(): + ) in cache_entry.guard_manager.id_matched_objs.items(): if weakref_from_cache_entry() is not None: weakref_from_frame = _get_weakref_from_f_locals(frame, local_name) if weakref_from_frame is not weakref_from_cache_entry: @@ -176,7 +176,7 @@ def exceeds_cache_size_limit( if cache_size.will_compilation_exceed_specific_limit(config.cache_size_limit): return True, "cache_size_limit" # NOTE this check is needed in the case that the frame's cache doesn't grow - # and we keep recompiling. This can happen if the guard check_fn becomes invalidated, + # and we keep recompiling. This can happen if the guard guard_manager becomes invalidated, # e.g. due to guarded objects being freed. This technically makes the # will_compilation_exceed_accumulated_limit check unnecessary, but we will keep the # check in case we have a better fix in the future. diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 6a313b08c64c8..a3aa8eb00e4b4 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -842,7 +842,7 @@ def count_args(code: CodeType) -> int: compile_id_str = str(compile_id) if compile_id is not None else "Unknown" annotation_str = "Torch-Compiled Region: " + compile_id_str guarded_code = GuardedCode( - out_code, check_fn.check_fn, compile_id, annotation_str + out_code, check_fn.guard_manager, compile_id, annotation_str # type: ignore[arg-type] ) if not output.is_empty_graph() and hooks.guard_export_fn is not None: diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 717eb1499c799..51706da78f12c 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -145,7 +145,7 @@ verbose_guards_log = torch._logging.getArtifactLogger(__name__, "verbose_guards") -class GuardManager: +class GuardManagerWrapper: """ A helper class that contains the root guard manager. 
An instance of this class is stored in the Dynamo cache entry, so that the cache entry can @@ -526,7 +526,7 @@ def __init__( lookup_weakrefs: Callable[[object], ReferenceType[object]], local_scope: Dict[str, object], global_scope: Dict[str, object], - guard_manager: GuardManager, + guard_manager: GuardManagerWrapper, check_fn_manager: CheckFunctionManager, ): self.id_ref = id_ref @@ -570,7 +570,7 @@ def __init__( self.tensor_check_names: List[str] = [] self.tensor_check_examples: List[torch.Tensor] = [] self.tensor_check_guards: List[Guard] = [] - self.tensor_check_guard_managers: List[GuardManager] = [] + self.tensor_check_guard_managers: List[GuardManagerWrapper] = [] self.check_fn_manager: CheckFunctionManager = check_fn_manager @@ -583,7 +583,7 @@ def __init__( self.key_order_guarded_dict_ids.add(id(self.get(source_name))) # Keep track of weak references of objects with ID_MATCH guard. This - # info is stored alongside optimized_code and check_fn and is used to + # info is stored alongside optimized_code and guard_manager and is used to # limit the number of cache entries with same ID_MATCH'd object. self.id_matched_objs: Dict[str, ReferenceType[object]] = {} @@ -591,7 +591,6 @@ def __init__( self._cached_guard_managers: Dict[ str, torch._C._dynamo.guards.GuardManager ] = {} - self._cached_duplicate_input_guards: Set[Tuple[str, str]] = set() def guard_on_dict_keys_and_ignore_order(self, example_value, guard): @@ -2111,7 +2110,7 @@ def __init__( ): guards = output_graph.guards if output_graph else None self._weakrefs: Dict[int, ReferenceType[object]] = {} - self.guard_manager = GuardManager() + self.guard_manager = GuardManagerWrapper() self.output_graph = output_graph w_builder = None @@ -2171,17 +2170,17 @@ def cleanup_builder(weak_b): guard.create(builder) - self.check_fn = self.compile_check_fn(builder, guards, guard_fail_fn) + self.compile_check_fn(builder, guards, guard_fail_fn) # Keep track of weak references of objects with ID_MATCH guard. This - # info is stored alongside optimized_code and check_fn and is used to + # info is stored alongside optimized_code and guard_manager and is used to # limit the number of cache entries with same ID_MATCH'd object. # TODO(anijain2305) - Currently this information is stored as an attr on - # the check_fn itself to avoid changing CacehEntry datastructure in - # eval_frame.c. In future, we should probably replace check_fn with a + # the guard_manager itself to avoid changing CacheEntry data structure in + # eval_frame.c. In future, we should probably replace guard_manager with a # queryable data structure such that this information is already present # in some form. - self.check_fn.id_matched_objs = builder.id_matched_objs + self.guard_manager.id_matched_objs = builder.id_matched_objs # TODO: don't do the string rep, do something more structured here torch._logging.trace_structured( @@ -2189,7 +2188,6 @@ def cleanup_builder(weak_b): ) guards_log.debug("%s", self.guard_manager) self.guard_manager.id_matched_objs = builder.id_matched_objs - self.check_fn = self.guard_manager # Check that the guard returns True. False means that we will always # recompile. @@ -2351,45 +2349,39 @@ def add_code_part(code_part, guard, log_only=False): } globals_for_guard_fn = {"G": builder.scope["G"]} - # Guard manager construction is complete - # TODO (anijain2305) - When enable_cpp_guard_manager is ON by - # default, change the guard_fn name to be guard_manager everywhere - # to avoid confusion. 
- guard_fn = self.guard_manager - # Ensure we did not miss to insert a guard in cpp guard manager. + # Guard manager construction is complete. Ensure we did not miss to + # insert a guard in cpp guard manager. assert len(code_parts) == 0 - guard_fn.closure_vars = closure_vars - # TODO(whc) maybe '.code_parts' was only kept around for the guard callback? so we don't need both - guard_fn.args = largs - guard_fn.populate_code_parts_for_debugging() - guard_fn.verbose_code_parts = verbose_code_parts + self.guard_manager.closure_vars = closure_vars + self.guard_manager.args = largs + self.guard_manager.populate_code_parts_for_debugging() + self.guard_manager.verbose_code_parts = verbose_code_parts # Grab only G, but preserve "G" because guards access it as "G" - guard_fn.global_scope = globals_for_guard_fn - guard_fn.guard_fail_fn = guard_fail_fn + self.guard_manager.global_scope = globals_for_guard_fn + self.guard_manager.guard_fail_fn = guard_fail_fn # will be populated by a non-owning reference to CacheEntry/ExtraState # when the CacheEntry is constructed - guard_fn.cache_entry = None - guard_fn.extra_state = None - guard_fn.no_tensor_aliasing_sources = tensor_check_names - return guard_fn + self.guard_manager.cache_entry = None + self.guard_manager.extra_state = None + self.guard_manager.no_tensor_aliasing_sources = tensor_check_names def invalidate(self): # Some tests reveal that CheckFunctionManager has no attribute - # check_fn, but this case should not be of any concern. + # guard_manager, but this case should not be of any concern. # This case doesn't seem easy to repro. if ( - hasattr(self, "check_fn") - and self.check_fn is not DeletedGuardFn - and (cache_entry := self.check_fn.cache_entry) is not None - and (extra_state := self.check_fn.extra_state) is not None + hasattr(self, "guard_manager") + and self.guard_manager is not DeletedGuardFn + and (cache_entry := self.guard_manager.cache_entry) is not None + and (extra_state := self.guard_manager.extra_state) is not None ): assert isinstance(cache_entry, CacheEntry) assert isinstance(extra_state, ExtraState) extra_state.invalidate(cache_entry) - self.check_fn.cache_entry = None - self.check_fn.extra_state = None - self.check_fn = DeletedGuardFn + self.guard_manager.cache_entry = None + self.guard_manager.extra_state = None + self.guard_manager = DeletedGuardFn # type: ignore[assignment] def id_ref(self, obj): """add a weakref, return the id""" @@ -2499,23 +2491,22 @@ def recompilation_reason_for_no_tensor_aliasing_guard(guard_manager, scope): def get_guard_fail_reason_helper( - guard_fn: GuardFn, + guard_manager: GuardFn, f_locals: Dict[str, object], compile_id: CompileId, ) -> str: """ - Return the reason why `guard_fn` failed. + Return the reason why `guard_manager` failed. Updates `guard_failures` with the generated reason. - Only the first failed check of guard_fn is reported. + Only the first failed check of guard_manager is reported. """ - scope = {"L": f_locals, "G": guard_fn.global_scope["G"]} - scope.update(guard_fn.closure_vars) + scope = {"L": f_locals, "G": guard_manager.global_scope["G"]} + scope.update(guard_manager.closure_vars) reasons: List[str] = [] no_tensor_aliasing_check_failed = False verbose_code_parts: List[str] = [] - guard_manager = guard_fn guard_debug_info = guard_manager.check_verbose(f_locals) # type: ignore[attr-defined] # For test_export_with_map_cond, the check_verbose fail even without the # C++ guard manager. We need to fix the issue to remove the comment. 
@@ -2537,10 +2528,12 @@ def get_guard_fail_reason_helper( verbose_code_parts = [] if no_tensor_aliasing_check_failed: - reasons = recompilation_reason_for_no_tensor_aliasing_guard(guard_fn, scope) + reasons = recompilation_reason_for_no_tensor_aliasing_guard( + guard_manager, scope + ) else: for part in verbose_code_parts: - global_scope = dict(guard_fn.global_scope) + global_scope = dict(guard_manager.global_scope) global_scope["__compile_source__"] = part with report_compile_source_on_error(): try: @@ -2565,17 +2558,17 @@ def get_guard_fail_reason_helper( def get_guard_fail_reason( - guard_fn: GuardFn, + guard_manager: GuardFn, code: types.CodeType, f_locals: Dict[str, object], compile_id: CompileId, ) -> str: - reason_str = get_guard_fail_reason_helper(guard_fn, f_locals, compile_id) + reason_str = get_guard_fail_reason_helper(guard_manager, f_locals, compile_id) guard_failures[orig_code_map[code]].append(reason_str) try: - if guard_fn.guard_fail_fn is not None: - guard_fn.guard_fail_fn( + if guard_manager.guard_fail_fn is not None: + guard_manager.guard_fail_fn( GuardFail(reason_str or "unknown reason", orig_code_map[code]) ) except Exception as e: @@ -2597,7 +2590,7 @@ def get_and_maybe_log_recompilation_reason( reasons = [] while cache_entry is not None: reason = get_guard_fail_reason( - cache_entry.check_fn, + cache_entry.guard_manager, cache_entry.code, frame.f_locals, cache_entry.compile_id, @@ -2647,7 +2640,7 @@ def get_and_maybe_log_recompilation_reason( def guard_error_hook( - guard_fn: GuardFn, + guard_manager: GuardFn, code: types.CodeType, f_locals: Dict[str, object], index: int, @@ -2656,15 +2649,15 @@ def guard_error_hook( print( f"ERROR RUNNING GUARDS {code.co_name} {code.co_filename}:{code.co_firstlineno}" ) - print("lambda " + ", ".join(guard_fn.args) + ":") - print(" ", " and\n ".join(guard_fn.code_parts)) + print("lambda " + ", ".join(guard_manager.args) + ":") + print(" ", " and\n ".join(guard_manager.code_parts)) - print(guard_fn) + print(guard_manager) - local_scope = {"L": f_locals, **guard_fn.closure_vars} - for guard in guard_fn.code_parts: + local_scope = {"L": f_locals, **guard_manager.closure_vars} + for guard in guard_manager.code_parts: try: - eval(guard, guard_fn.global_scope, local_scope) + eval(guard, guard_manager.global_scope, local_scope) except: # noqa: B001,E722 print(f"Malformed guard:\n{guard}") diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py index bbc5f27713ab4..9281c7c7e284e 100644 --- a/torch/_dynamo/testing.py +++ b/torch/_dynamo/testing.py @@ -191,7 +191,7 @@ def insert_nops(instructions: List[Any], code_options: Any) -> None: torch_function_mode_stack=[], ) - return GuardedCode(code, CheckFunctionManager(graph).check_fn, CompileId(0, 0)) + return GuardedCode(code, CheckFunctionManager(graph).guard_manager, CompileId(0, 0)) # type: ignore[arg-type] class CompileCounter: diff --git a/torch/_dynamo/types.py b/torch/_dynamo/types.py index 16ef7b5821c2a..298741a4e9586 100644 --- a/torch/_dynamo/types.py +++ b/torch/_dynamo/types.py @@ -3,7 +3,7 @@ import types from typing import Any, Callable, Dict, List, NamedTuple, Optional, Protocol, Union -# CacheEntry has a `check_fn` field for the guard, and a `code` field for the code object. +# CacheEntry has a `guard_manager` field for the guard, and a `code` field for the code object. 
from torch._C._dynamo.eval_frame import ( _CacheEntry as CacheEntry, _ExtraState as ExtraState, @@ -46,7 +46,7 @@ def __call__(self, f_locals: Dict[str, object]) -> bool: @dataclasses.dataclass class GuardedCode: code: types.CodeType - check_fn: GuardFn + guard_manager: GuardFn compile_id: CompileId trace_annotation: str = "Unknown" @@ -67,7 +67,7 @@ def __call__( class DynamoGuardHook(Protocol): def __call__( self, - guard_fn: GuardFn, + guard_manager: GuardFn, code: types.CodeType, f_locals: Dict[str, object], index: int, diff --git a/torch/csrc/dynamo/cache_entry.cpp b/torch/csrc/dynamo/cache_entry.cpp index 6ea8a441c48fb..2dc4bbece04b6 100644 --- a/torch/csrc/dynamo/cache_entry.cpp +++ b/torch/csrc/dynamo/cache_entry.cpp @@ -6,7 +6,7 @@ CacheEntry::CacheEntry(const py::handle& guarded_code, PyObject* backend) : backend{backend} { - this->check_fn = guarded_code.attr("check_fn"); + this->guard_manager = guarded_code.attr("guard_manager"); this->code = guarded_code.attr("code"); this->compile_id = guarded_code.attr("compile_id"); py::object trace_annotation = guarded_code.attr("trace_annotation"); @@ -16,8 +16,8 @@ CacheEntry::CacheEntry(const py::handle& guarded_code, PyObject* backend) } else { this->trace_annotation = "Unknown"; } - this->root_mgr = - torch::dynamo::convert_to_root_guard_manager(this->check_fn.attr("root")); + this->root_mgr = torch::dynamo::convert_to_root_guard_manager( + this->guard_manager.attr("root")); } C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED( @@ -25,9 +25,9 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED( C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated-copy-dtor") // NOLINTNEXTLINE(bugprone-exception-escape) CacheEntry::~CacheEntry() { - // prevent check_fn from use-after-free when invalidating - this->check_fn.attr("cache_entry") = py::none(); - this->check_fn.attr("extra_state") = py::none(); + // prevent guard_manager from use-after-free when invalidating + this->guard_manager.attr("cache_entry") = py::none(); + this->guard_manager.attr("extra_state") = py::none(); } C10_DIAGNOSTIC_POP() C10_DIAGNOSTIC_POP() diff --git a/torch/csrc/dynamo/cache_entry.h b/torch/csrc/dynamo/cache_entry.h index 7d1d92084444c..9747c0baa421a 100644 --- a/torch/csrc/dynamo/cache_entry.h +++ b/torch/csrc/dynamo/cache_entry.h @@ -18,11 +18,12 @@ of the cache is as follows: -> ExtraState -> CacheEntry (list) - -> check_fn + -> guard_manager (a wrapper that contains the actual guard manager at its +attr named root) -> code -> FrameState -CacheEntry is a linked list node containing the check_fn for guards +CacheEntry is a linked list node containing the guard_manager for guards and the optimized code. The FrameState is a PyDict that enables sharing between different frames. 
This @@ -41,8 +42,8 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED( C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdeprecated-copy-dtor") typedef struct VISIBILITY_HIDDEN CacheEntry { // check the guards: lambda: : bool - py::object check_fn; - // modified user bytecode (protected by check_fn's guards) + py::object guard_manager; + // modified user bytecode (protected by guard_manager's guards) py::object code; // CompileId corresponding to this compilation py::object compile_id; diff --git a/torch/csrc/dynamo/extra_state.cpp b/torch/csrc/dynamo/extra_state.cpp index 1c1632b22746b..7ee7961096556 100644 --- a/torch/csrc/dynamo/extra_state.cpp +++ b/torch/csrc/dynamo/extra_state.cpp @@ -132,7 +132,7 @@ void lookup( if (guard_error_hook) { py::handle guard_error_hook_handle(guard_error_hook); guard_error_hook_handle( - cache_entry.check_fn, + cache_entry.guard_manager, cache_entry.code, locals, index, @@ -168,12 +168,12 @@ CacheEntry* create_cache_entry( auto new_iter = extra_state->cache_entry_list.begin(); new_iter->_owner = extra_state; new_iter->_owner_loc = new_iter; - // Set check_fn references to extra_state and CacheEntry + // Set guard_manager references to extra_state and CacheEntry // Warning: lifetime is controlled by C++! - py::handle check_fn = py::handle(guarded_code).attr("check_fn"); - check_fn.attr("cache_entry") = + py::handle guard_manager = py::handle(guarded_code).attr("guard_manager"); + guard_manager.attr("cache_entry") = py::cast(*new_iter, py::return_value_policy::reference); - check_fn.attr("extra_state") = + guard_manager.attr("extra_state") = py::cast(extra_state, py::return_value_policy::reference); return &*new_iter; } diff --git a/torch/csrc/dynamo/init.cpp b/torch/csrc/dynamo/init.cpp index 5993c25caace1..16a3f1e2c9736 100644 --- a/torch/csrc/dynamo/init.cpp +++ b/torch/csrc/dynamo/init.cpp @@ -67,7 +67,7 @@ void initDynamoBindings(PyObject* torch) { auto m = py::handle(eval_frame).cast(); py::class_(m, "_CacheEntry") - .def_readonly("check_fn", &CacheEntry::check_fn) + .def_readonly("guard_manager", &CacheEntry::guard_manager) .def_readonly("code", &CacheEntry::code) .def_readonly("compile_id", &CacheEntry::compile_id) .def_readonly("trace_annotation", &CacheEntry::trace_annotation) From c84f9b2069edd06e09fc04b89cb1672733dc60b4 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 25 Oct 2024 16:40:04 -0700 Subject: [PATCH 118/161] [dynamo][guards] Log average time of constructed guard_manager (#138941) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138941 Approved by: https://github.com/jansel ghstack dependencies: #138512, #138896 --- torch/_dynamo/guards.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 51706da78f12c..f00b96300e4c9 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -16,6 +16,7 @@ import re import sys import textwrap +import time import types import weakref from contextlib import contextmanager @@ -2202,6 +2203,9 @@ def cleanup_builder(weak_b): ) raise AssertionError(f"Guard check failed: {reasons}") + if guards_log.isEnabledFor(logging.DEBUG): + self.profile_guard_eval(output_graph.local_scope) + # NB - We have to very careful of cleaning up here. Because of the # invalidate function, we can create a weakref finalizer that keeps # `self` alive for very long. 
Sometimes by mistake, we can run @@ -2213,6 +2217,18 @@ def cleanup_builder(weak_b): self._weakrefs.clear() self.output_graph = None + def profile_guard_eval(self, f_locals): + start_time = time.time() + iterations = 0 + profile_duration = 1 # unit is seconds + + while time.time() - start_time < profile_duration: + self.guard_manager.check(f_locals) + iterations += 1 + + guard_latency = 10**6 / iterations # us + guards_log.debug("Guard eval latency = %s us", f"{guard_latency:.2f}") + def compile_check_fn(self, builder, guards_out, guard_fail_fn): # see parallel handling of ".0" / "___implicit0" in _eval_frame.c largs = builder.argnames From 1a2dc89f170edbe2e10615a3aa32d1393729a6f9 Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Sat, 26 Oct 2024 15:26:18 +0000 Subject: [PATCH 119/161] [Dynamo] Allow `torch.cond()` to handle emply arguments (#138190) Fixes #138150 ```python import torch @torch.compile(fullgraph=True) def foo(x, y, z): def f(): return y + 2 def g(): return z + 1 return torch.cond(x, f, g) print(foo(torch.zeros(1), torch.ones(1), torch.ones(1))) # tensor([2.]) print(foo(torch.ones(1), torch.ones(1), torch.ones(1))) # tensor([3.]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138190 Approved by: https://github.com/ezyang, https://github.com/zou3519 --- test/dynamo/test_export.py | 3 ++- test/dynamo/test_higher_order_ops.py | 16 ++++++++++++++++ torch/_higher_order_ops/cond.py | 14 +++++++++++--- torch/_higher_order_ops/while_loop.py | 2 +- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py index 825000734489c..78a72208b4fcd 100644 --- a/test/dynamo/test_export.py +++ b/test/dynamo/test_export.py @@ -3264,10 +3264,11 @@ def false_fn(x): def f(x): return cond(x.shape[0] > 10, true_fn, false_fn) + # Now we allow torch.cond to handle empty args example_inputs = (torch.rand(5),) with self.assertRaisesRegex( TypeError, - r"cond\(\) missing 1 required positional argument: 'operands'", + r"false_fn\(\) missing 1 required positional argument: 'x'", ): f(*example_inputs) diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index 422b2a65306ee..ff3c83863a5b0 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -2573,6 +2573,22 @@ def fn(pred, pytree_in): ): torch.compile(fn, backend="eager")(pred, pytree_in) + def test_cond_with_empty_operands(self): + @torch.compile(fullgraph=True) + def fn(x, y, z): + def true_fn(): + return y + 2 + + def false_fn(): + return z + 1 + + return torch.cond(x, true_fn, false_fn) + + zeros = torch.zeros(1) + ones = torch.ones(1) + self.assertEqual(fn(zeros, ones, ones), torch.tensor([2.0])) + self.assertEqual(fn(ones, ones, ones), torch.tensor([3.0])) + def test_hints_wrapper(self): def ref_fn(x, y): x = x + y diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index f0a260de40681..a90bcd1bc9e03 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -1,6 +1,8 @@ +# mypy: allow-untyped-decorators # mypy: allow-untyped-defs import contextlib import logging +from typing import Any, Callable, List, Tuple, Union import torch import torch._subclasses.functional_tensor @@ -62,7 +64,12 @@ def __call__(self, pred, true_fn, false_fn, operands): @exposed_in("torch") -def cond(pred, true_fn, false_fn, operands): +def cond( + pred: Union[bool, int, float, torch.Tensor], + true_fn: Callable, + false_fn: Callable, + operands: Union[Tuple, List] = (), 
+) -> Any: r""" Conditionally applies `true_fn` or `false_fn`. @@ -95,7 +102,8 @@ def cond(pred, true_branch, false_branch, operands): have consistent input and outputs, meaning the inputs have to be the same, and the outputs have to be the same type and shape. - operands (Tuple of possibly nested dict/list/tuple of torch.Tensor): A tuple of inputs to the true/false functions. + operands (Tuple of possibly nested dict/list/tuple of torch.Tensor): A tuple of inputs to the + true/false functions. It can be empty if true_fn/false_fn doesn't require input. Defaults to (). Example:: @@ -156,7 +164,7 @@ def _validate_input(pred, true_fn, false_fn, operands): ) if not callable(true_fn) or not callable(false_fn): - raise RuntimeError("Expect both branches to be callbale.") + raise RuntimeError("Expect both branches to be callable.") if not isinstance(operands, (tuple, list)) or pytree.tree_any( lambda t: not isinstance(t, torch.Tensor), operands diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py index f14321842f40b..fe8f11a9a7a36 100644 --- a/torch/_higher_order_ops/while_loop.py +++ b/torch/_higher_order_ops/while_loop.py @@ -129,7 +129,7 @@ def body_fn(iter, x): def _validate_input(cond_fn, body_fn, carried_inputs): if not callable(cond_fn) or not callable(body_fn): - raise RuntimeError("Expect cond_fn and body_fn to be callbale.") + raise RuntimeError("Expect cond_fn and body_fn to be callable.") if not isinstance(carried_inputs, (tuple, list)) or pytree.tree_any( lambda t: not isinstance(t, torch.Tensor), carried_inputs From 705f5b3489a8c8b217e708163d486bc3d82d177f Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 25 Oct 2024 10:40:28 -0700 Subject: [PATCH 120/161] Several enhancements for check_results.py (#137925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) always generate expected_results.csv up to accuracy of first three digits ex: 112313212312 --> 1120000000 .. etc 2) regenerate all record in expected_results.csv and not just failed ones , why? because if we change something by 1.3% and noise 1.5% we want to reflect that. 3) add "please update all results that changed significantly, and not only the failed ones" ``` (myenv) [lsakka@devgpu005.nha1 ~/pytorch/benchmarks/dynamo/pr_time_benchmarks (check_result_ehancements)]$ python check_results.py test_check_result/expected_test.csv te st_check_result/result_test.csv out WIN: benchmark ('a', 'instruction count') failed, actual result 9011111111 is -18.16% lower than expected 11011111111 ±1.00% please update the expected results. please update all results that changed significantly, and not only the failed ones REGRESSION: benchmark ('b', 'memory') failed, actual result 20011111111 is 99.89% higher than expected 10011111111 ±+10.00% if this is an expected regression, please update the expected results. please update all results that changed significantly, and not only the failed ones REGRESSION: benchmark ('c', 'something') failed, actual result 107111111111 is 969.92% higher than expected 10011111111 ±+10.00% if this is an expected regression, please update the expected results. please update all results that changed significantly, and not only the failed ones MISSING REGRESSION TEST: benchmark ('d', 'missing-test') does not have a regression test enabled for it. 
new expected results file content if needed: a,instruction count,9011000000,0.01 b,memory,20010000000,0.1 c,something,107100000000,0.1 There was some failures you can use the new reference expected result stored at path:out and printed above ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/137925 Approved by: https://github.com/aorenste --- .../pr_time_benchmarks/check_results.py | 100 +++++++++++++----- .../test_check_result/expected_test.csv | 6 +- .../test_check_result/result_test.csv | 8 +- 3 files changed, 78 insertions(+), 36 deletions(-) diff --git a/benchmarks/dynamo/pr_time_benchmarks/check_results.py b/benchmarks/dynamo/pr_time_benchmarks/check_results.py index 80673b7686709..8b18af47a589e 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/check_results.py +++ b/benchmarks/dynamo/pr_time_benchmarks/check_results.py @@ -22,6 +22,35 @@ class ResultFileEntry: actual_value: int +def replace_with_zeros(num): + """ + Keeps the first three digits of an integer and replaces the rest with zeros. + + Args: + num (int): The number to modify. + + Returns: + int: The modified number. + + Raises: + ValueError: If the input is not an integer. + """ + # Check if input is an integer + if not isinstance(num, int): + raise ValueError("Input must be an integer") + + # Calculate the number of digits to remove + digits_to_remove = len(str(abs(num))) - 4 + + # Replace digits with zeros + if digits_to_remove > 0: + modified_num = (num // 10**digits_to_remove) * 10**digits_to_remove + else: + modified_num = num + + return modified_num + + def main(): # Expected file is the file that have the results that we are comparing against. # Expected has the following format: @@ -37,8 +66,7 @@ def main(): result_file_path = sys.argv[2] # A path where a new expected results file will be written that can be used to replace expected_results.csv - # in case of failure. In case of no failure the content of this file will match expected_file_path, values - # will be changed for benchmarks that failed only. + # in case of failure. In case of no failure the content of this file will match expected_file_path. reference_expected_results_path = sys.argv[3] # Read expected data file. @@ -85,6 +113,7 @@ def main(): low = entry.expected_value - entry.expected_value * entry.noise_margin high = entry.expected_value + entry.expected_value * entry.noise_margin result = result_data[key].actual_value + ratio = float(result - entry.expected_value) * 100 / entry.expected_value def log(event_name): scribe.open_source_signpost( @@ -97,37 +126,43 @@ def log(event_name): "actual_value": result, "expected_value": entry.expected_value, "noise_margin": entry.noise_margin, + "change_ratio": ratio, } ), ) - ratio = float(result - entry.expected_value) * 100 / entry.expected_value + new_entry = copy.deepcopy(entry) + # only change if abs(ratio) > entry.noise_margin /4. 
+ new_entry.expected_value = ( + replace_with_zeros(result) + if abs(ratio) > entry.noise_margin / 4 + else entry.expected_value + ) + new_expected[key] = new_entry if result > high: - new_entry = copy.deepcopy(entry) - new_entry.expected_value = result - new_expected[key] = new_entry - fail = True print( f"REGRESSION: benchmark {key} failed, actual result {result} " f"is {ratio:.2f}% higher than expected {entry.expected_value} ±{entry.noise_margin*100:+.2f}% " f"if this is an expected regression, please update the expected results.\n" ) + print( + "please update all results that changed significantly, and not only the failed ones" + ) log("fail_regression") elif result < low: - new_entry = copy.deepcopy(entry) - new_entry.expected_value = result - new_expected[key] = new_entry - fail = True print( f"WIN: benchmark {key} failed, actual result {result} is {ratio:+.2f}% lower than " f"expected {entry.expected_value} ±{entry.noise_margin*100:.2f}% " - f"please update the expected results.\n" + f"please update the expected results. \n" + ) + print( + "please update all results that changed significantly, and not only the failed ones" ) log("fail_win") @@ -157,26 +192,33 @@ def log(event_name): ), ) + with open(reference_expected_results_path, "w", newline="") as csvfile: + writer = csv.writer(csvfile) + for entry in new_expected.values(): + # Write the data to the CSV file + # print(f"{entry.benchmark_name},{entry.metric_name,},{round(entry.expected_value)},{entry.noise_margin}") + writer.writerow( + [ + entry.benchmark_name, + entry.metric_name, + entry.expected_value, + entry.noise_margin, + ] + ) + # Three empty rows for merge conflicts. + writer.writerow([]) + writer.writerow([]) + writer.writerow([]) + + print("new expected results file content if needed:") + with open(reference_expected_results_path) as f: + print(f.read()) + if fail: print( - f"You can use the new reference expected result stored at path: {reference_expected_results_path}.\n" + f"There was some failures you can use the new reference expected result stored at path:" + f"{reference_expected_results_path} and printed above\n" ) - - with open(reference_expected_results_path, "w", newline="") as csvfile: - writer = csv.writer(csvfile) - for entry in new_expected.values(): - # Write the data to the CSV file - writer.writerow( - [ - entry.benchmark_name, - entry.metric_name, - round(entry.expected_value), - entry.noise_margin, - ] - ) - - with open(reference_expected_results_path) as f: - print(f.read()) sys.exit(1) else: print("All benchmarks passed") diff --git a/benchmarks/dynamo/pr_time_benchmarks/test_check_result/expected_test.csv b/benchmarks/dynamo/pr_time_benchmarks/test_check_result/expected_test.csv index 830751a8547c6..a3bcac705ea62 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/test_check_result/expected_test.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/test_check_result/expected_test.csv @@ -1,3 +1,3 @@ -a, instruction count, 110, 0.01 -b, memory, 100, 0.1 -c, something, 100, 0.1 +a, instruction count, 11011111111, 0.01 +b, memory, 10011111111, 0.1 +c, something, 10011111111, 0.1 diff --git a/benchmarks/dynamo/pr_time_benchmarks/test_check_result/result_test.csv b/benchmarks/dynamo/pr_time_benchmarks/test_check_result/result_test.csv index 07f6c814fbaef..f198fcd4e30d0 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/test_check_result/result_test.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/test_check_result/result_test.csv @@ -1,4 +1,4 @@ -a, instruction count, 90 -b, memory, 200 -c, something, 107 -d, 
missing-test, 10 +a, instruction count, 9011111111 +b, memory, 20011111111 +c, something, 107111111111 +d, missing-test, 10111111111 From 4de93d1eadd6fc0542102ac4bdd822123c81c881 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sat, 26 Oct 2024 17:37:40 +0000 Subject: [PATCH 121/161] [BE][Ez]: Fix bad TypeIs conversion (#138990) Fixes on TypeIs / TypeGuard conversion error. Follow up to #133814 Thanks for @ezyang for reminding me to double check the side conditions here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138990 Approved by: https://github.com/malfet --- torch/_subclasses/fake_tensor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 565b3a2fc66ae..3a7677320c7c3 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -32,7 +32,7 @@ TypeVar, Union, ) -from typing_extensions import Self, TypeIs +from typing_extensions import Self, TypeGuard, TypeIs from weakref import ReferenceType import torch @@ -1214,7 +1214,7 @@ def reset_nt_tensor_id_counter(self) -> None: # In this case, it's insufficient to test only one FakeTensor: you need # to distinguish between our fake tensor and other fake tensors. That's # what this function does. - def is_our_fake(self, t: object) -> TypeIs[FakeTensor]: + def is_our_fake(self, t: object) -> TypeGuard[FakeTensor]: return isinstance(t, FakeTensor) and t.fake_mode is self # If we should avoid device init. This changes the behavior of various APIs: From 1a732551026d913d214248976307adf18c0785c2 Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 26 Oct 2024 17:41:25 +0000 Subject: [PATCH 122/161] Concat namespaces in jit code (#138976) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138976 Approved by: https://github.com/Skylion007 --- torch/csrc/jit/backends/backend.h | 6 ++---- torch/csrc/jit/backends/backend_debug_handler.cpp | 6 ++---- torch/csrc/jit/backends/backend_debug_handler.h | 6 ++---- torch/csrc/jit/backends/backend_debug_info.cpp | 8 ++------ torch/csrc/jit/backends/backend_debug_info.h | 6 ++---- torch/csrc/jit/backends/backend_detail.h | 6 ++---- torch/csrc/jit/backends/backend_init.cpp | 6 ++---- torch/csrc/jit/backends/backend_init.h | 6 ++---- torch/csrc/jit/backends/backend_interface.cpp | 6 ++---- torch/csrc/jit/backends/backend_interface.h | 6 ++---- torch/csrc/jit/backends/backend_preprocess.h | 6 ++---- torch/csrc/jit/backends/backend_resolver.cpp | 6 ++---- torch/csrc/jit/backends/backend_resolver.h | 6 ++---- .../jit/backends/coreml/objc/PTMCoreMLTensorSpec.h | 10 ++-------- .../jit/backends/xnnpack/executor/xnn_executor.h | 10 ++-------- torch/csrc/jit/codegen/cuda/interface.cpp | 10 ++-------- torch/csrc/jit/codegen/cuda/interface.h | 10 ++-------- torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h | 10 ++-------- torch/csrc/jit/codegen/fuser/cpu/resource_strings.h | 10 ++-------- torch/csrc/jit/codegen/fuser/cpu/temp_file.h | 10 ++-------- torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h | 10 ++-------- torch/csrc/jit/codegen/onednn/decompose_silu.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/decompose_silu.h | 10 ++-------- torch/csrc/jit/codegen/onednn/defer_size_check.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/defer_size_check.h | 10 ++-------- torch/csrc/jit/codegen/onednn/graph_fuser.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/graph_fuser.h | 10 ++-------- 
torch/csrc/jit/codegen/onednn/graph_helper.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/graph_helper.h | 10 ++-------- torch/csrc/jit/codegen/onednn/graph_rewriter.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/guard_shape.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/guard_shape.h | 10 ++-------- torch/csrc/jit/codegen/onednn/interface.cpp | 12 ++++-------- torch/csrc/jit/codegen/onednn/interface.h | 12 ++++-------- torch/csrc/jit/codegen/onednn/kernel.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/kernel.h | 10 ++-------- torch/csrc/jit/codegen/onednn/layout_propagation.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/layout_propagation.h | 10 ++-------- torch/csrc/jit/codegen/onednn/operator.h | 10 ++-------- torch/csrc/jit/codegen/onednn/prepare_binary.cpp | 10 ++-------- torch/csrc/jit/codegen/onednn/prepare_binary.h | 10 ++-------- torch/csrc/jit/codegen/onednn/register_interface.cpp | 10 ++-------- torch/csrc/jit/mobile/nnc/aot_compiler.cpp | 10 ++-------- torch/csrc/jit/mobile/nnc/aot_compiler.h | 10 ++-------- torch/csrc/jit/mobile/nnc/backend.cpp | 10 ++-------- torch/csrc/jit/mobile/nnc/context.cpp | 10 ++-------- torch/csrc/jit/mobile/nnc/context.h | 10 ++-------- torch/csrc/jit/mobile/nnc/registry.cpp | 10 ++-------- torch/csrc/jit/mobile/nnc/registry.h | 10 ++-------- .../dbr_quantization/remove_redundant_aliases.cpp | 6 ++---- .../dbr_quantization/remove_redundant_aliases.h | 6 ++---- .../pattern_conversion/autograd_function_process.cpp | 6 ++---- .../pattern_conversion/autograd_function_process.h | 6 ++---- .../jit/passes/onnx/pattern_conversion/common.cpp | 6 ++---- .../csrc/jit/passes/onnx/pattern_conversion/common.h | 6 ++---- .../onnx/pattern_conversion/pattern_conversion.cpp | 6 ++---- .../onnx/pattern_conversion/pattern_conversion.h | 6 ++---- .../pattern_conversion/pattern_encapsulation.cpp | 6 ++---- .../onnx/pattern_conversion/pattern_encapsulation.h | 6 ++---- .../jit/passes/quantization/dedup_module_uses.cpp | 6 ++---- .../csrc/jit/passes/quantization/dedup_module_uses.h | 6 ++---- torch/csrc/jit/passes/quantization/finalize.cpp | 6 ++---- torch/csrc/jit/passes/quantization/finalize.h | 6 ++---- torch/csrc/jit/passes/quantization/fusion_passes.cpp | 6 ++---- torch/csrc/jit/passes/quantization/fusion_passes.h | 6 ++---- torch/csrc/jit/passes/quantization/helper.cpp | 6 ++---- torch/csrc/jit/passes/quantization/helper.h | 6 ++---- .../jit/passes/quantization/insert_observers.cpp | 6 ++---- .../csrc/jit/passes/quantization/insert_observers.h | 6 ++---- .../jit/passes/quantization/insert_quant_dequant.cpp | 6 ++---- .../jit/passes/quantization/insert_quant_dequant.h | 6 ++---- .../jit/passes/quantization/quantization_patterns.h | 6 ++---- .../jit/passes/quantization/quantization_type.cpp | 6 ++---- .../csrc/jit/passes/quantization/quantization_type.h | 6 ++---- .../passes/quantization/register_packed_params.cpp | 6 ++---- .../jit/passes/quantization/register_packed_params.h | 6 ++---- .../csrc/jit/passes/utils/check_alias_annotation.cpp | 6 ++---- torch/csrc/jit/passes/utils/check_alias_annotation.h | 6 ++---- torch/csrc/jit/passes/utils/memory_dag.cpp | 6 ++---- torch/csrc/jit/passes/utils/memory_dag.h | 6 ++---- torch/csrc/jit/passes/utils/op_registry.cpp | 6 ++---- torch/csrc/jit/passes/utils/op_registry.h | 6 ++---- torch/csrc/jit/passes/utils/optimization_utils.cpp | 6 ++---- torch/csrc/jit/passes/utils/optimization_utils.h | 6 ++---- torch/csrc/jit/passes/utils/subgraph_utils.cpp | 8 ++------ torch/csrc/jit/passes/utils/subgraph_utils.h | 9 
++------- torch/csrc/jit/tensorexpr/bounds_inference.h | 8 ++------ torch/csrc/jit/tensorexpr/bounds_overlap.h | 10 ++-------- torch/csrc/jit/tensorexpr/cpp_intrinsics.h | 8 ++------ torch/csrc/jit/tensorexpr/cuda_random.h | 8 ++------ torch/csrc/jit/tensorexpr/fwd_decls.h | 8 ++------ torch/csrc/jit/tensorexpr/ir.h | 8 ++------ torch/csrc/jit/tensorexpr/ir_cloner.h | 8 ++------ torch/csrc/jit/tensorexpr/ir_verifier.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/conv2d.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/matmul.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/norm.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/pointwise.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/quantization.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/reduction.h | 8 ++------ torch/csrc/jit/tensorexpr/operators/softmax.h | 8 ++------ torch/csrc/jit/testing/file_check.cpp | 9 ++------- torch/csrc/jit/testing/file_check.h | 6 ++---- torch/csrc/jit/testing/hooks_for_testing.cpp | 6 ++---- torch/csrc/jit/testing/hooks_for_testing.h | 6 ++---- 106 files changed, 216 insertions(+), 614 deletions(-) diff --git a/torch/csrc/jit/backends/backend.h b/torch/csrc/jit/backends/backend.h index 5aae642fa5517..db0205d395ddc 100644 --- a/torch/csrc/jit/backends/backend.h +++ b/torch/csrc/jit/backends/backend.h @@ -5,8 +5,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration) inline c10::FunctionSchema getIsAvailableSchema() { @@ -115,5 +114,4 @@ class backend { } }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_debug_handler.cpp b/torch/csrc/jit/backends/backend_debug_handler.cpp index 13c9778c67c10..6c2ba467bc6b2 100644 --- a/torch/csrc/jit/backends/backend_debug_handler.cpp +++ b/torch/csrc/jit/backends/backend_debug_handler.cpp @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { std::atomic BackendDebugInfoRecorder::unique_debug_handle_{0}; @@ -33,5 +32,4 @@ BackendDebugInfoMapType BackendDebugInfoRecorder::stopRecording() { return handles_to_inlined_callstack_ptrs_; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_debug_handler.h b/torch/csrc/jit/backends/backend_debug_handler.h index d4b00fe340f2b..4128832e7a078 100644 --- a/torch/csrc/jit/backends/backend_debug_handler.h +++ b/torch/csrc/jit/backends/backend_debug_handler.h @@ -7,8 +7,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { /* * BackendDebugHandleManager is responsible for issuing debug handles to @@ -136,5 +135,4 @@ class TORCH_API BackendDebugInfoRecorder { BackendDebugInfoMapType handles_to_inlined_callstack_ptrs_; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_debug_info.cpp b/torch/csrc/jit/backends/backend_debug_info.cpp index 5f6fbb6d3f316..c6fdac0646724 100644 --- a/torch/csrc/jit/backends/backend_debug_info.cpp +++ b/torch/csrc/jit/backends/backend_debug_info.cpp @@ -1,9 +1,7 @@ #include #include -namespace torch { -namespace jit { -namespace backend { +namespace torch::jit::backend { namespace { #ifdef BUILD_LITE_INTERPRETER static auto cls = torch::class_( @@ -18,6 +16,4 @@ static auto cls = torch::class_( #endif } // namespace -} // namespace backend -} // namespace jit -} // namespace torch +} // namespace torch::jit::backend diff --git 
a/torch/csrc/jit/backends/backend_debug_info.h b/torch/csrc/jit/backends/backend_debug_info.h index 291eb48132e8e..d6740b6c50466 100644 --- a/torch/csrc/jit/backends/backend_debug_info.h +++ b/torch/csrc/jit/backends/backend_debug_info.h @@ -5,8 +5,7 @@ #endif #include -namespace torch { -namespace jit { +namespace torch::jit { constexpr static auto kBackendUtilsNamespace = "backendutils"; constexpr static auto kBackendDebugInfoClass = "BackendDebugInfo"; @@ -61,5 +60,4 @@ class PyTorchBackendDebugInfoDummy : public torch::CustomClassHolder { PyTorchBackendDebugInfoDummy() = default; }; #endif -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_detail.h b/torch/csrc/jit/backends/backend_detail.h index 7299ce259bc8f..e69a93ebb148e 100644 --- a/torch/csrc/jit/backends/backend_detail.h +++ b/torch/csrc/jit/backends/backend_detail.h @@ -6,8 +6,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { using DebugHandleType = int64_t; @@ -37,5 +36,4 @@ TORCH_API Module codegen_backend_module( const c10::Dict& method_compile_spec, const c10::DictTypePtr& any_dict_ty); } // namespace detail -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_init.cpp b/torch/csrc/jit/backends/backend_init.cpp index 308857123d25d..380c9f0d096fe 100644 --- a/torch/csrc/jit/backends/backend_init.cpp +++ b/torch/csrc/jit/backends/backend_init.cpp @@ -7,8 +7,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Get all types that are shared in the module hierarchy rooted at \p mod. std::unordered_set getSharedModuleTypes(Module& mod) { @@ -189,5 +188,4 @@ void initJitBackendBindings(PyObject* module) { "Object ", py::str(orig_module), " is not a ScriptModule")); }); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_init.h b/torch/csrc/jit/backends/backend_init.h index e7be08c765953..7f2aac18bd04f 100644 --- a/torch/csrc/jit/backends/backend_init.h +++ b/torch/csrc/jit/backends/backend_init.h @@ -3,9 +3,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Initialize Python bindings for JIT to_ functions. void initJitBackendBindings(PyObject* module); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_interface.cpp b/torch/csrc/jit/backends/backend_interface.cpp index 661a9ac78b4dd..a124b8adf9253 100644 --- a/torch/csrc/jit/backends/backend_interface.cpp +++ b/torch/csrc/jit/backends/backend_interface.cpp @@ -1,10 +1,8 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { PyTorchBackendInterface::PyTorchBackendInterface() noexcept = default; PyTorchBackendInterface::~PyTorchBackendInterface() = default; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_interface.h b/torch/csrc/jit/backends/backend_interface.h index 099575da52859..331497f929d4c 100644 --- a/torch/csrc/jit/backends/backend_interface.h +++ b/torch/csrc/jit/backends/backend_interface.h @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { // Interface for a JIT backend. 
class TORCH_API PyTorchBackendInterface : public torch::CustomClassHolder { @@ -30,5 +29,4 @@ class TORCH_API PyTorchBackendInterface : public torch::CustomClassHolder { c10::IValue handle, c10::impl::GenericList inputs) = 0; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_preprocess.h b/torch/csrc/jit/backends/backend_preprocess.h index 0a256134aa96e..da4ebd5a93754 100644 --- a/torch/csrc/jit/backends/backend_preprocess.h +++ b/torch/csrc/jit/backends/backend_preprocess.h @@ -1,8 +1,7 @@ #pragma once #include -namespace torch { -namespace jit { +namespace torch::jit { class backend_preprocess_register { std::string backend_name_; @@ -14,5 +13,4 @@ class backend_preprocess_register { detail::registerBackendPreprocessFunction(name, preprocess); } }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_resolver.cpp b/torch/csrc/jit/backends/backend_resolver.cpp index d6041a25591bc..9c113550f9a1b 100644 --- a/torch/csrc/jit/backends/backend_resolver.cpp +++ b/torch/csrc/jit/backends/backend_resolver.cpp @@ -2,8 +2,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { // Essentially ClassNamespaceValue from import_source.cpp without the // SourceImporterImpl reference. This helps resolve the @@ -67,5 +66,4 @@ std::shared_ptr loweredModuleResolver() { return resolver; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/backend_resolver.h b/torch/csrc/jit/backends/backend_resolver.h index b0d5727d9d958..9dd4483725766 100644 --- a/torch/csrc/jit/backends/backend_resolver.h +++ b/torch/csrc/jit/backends/backend_resolver.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { // Create a Resolver for use in generating LoweredModules for specific backends. 
TORCH_API std::shared_ptr loweredModuleResolver(); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLTensorSpec.h b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLTensorSpec.h index 514629723047d..5aca1e51dd0b2 100644 --- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLTensorSpec.h +++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLTensorSpec.h @@ -3,10 +3,7 @@ #include -namespace torch { -namespace jit { -namespace mobile { -namespace coreml { +namespace torch::jit::mobile::coreml { struct TensorSpec { std::string name = ""; @@ -26,7 +23,4 @@ static inline c10::ScalarType scalar_type(const std::string& type_string) { return c10::ScalarType::Undefined; } -} // namespace coreml -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::coreml diff --git a/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h index 33542d69c80e2..118af11d031fc 100644 --- a/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h +++ b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h @@ -8,10 +8,7 @@ #include #include -namespace torch { -namespace jit { -namespace xnnpack { -namespace delegate { +namespace torch::jit::xnnpack::delegate { class XNNExecutor { private: @@ -68,7 +65,4 @@ class XNNExecutor { friend class XNNCompiler; }; -} // namespace delegate -} // namespace xnnpack -} // namespace jit -} // namespace torch +} // namespace torch::jit::xnnpack::delegate diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index d3e60781605e1..d91f3302d0aa8 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -9,10 +9,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { +namespace torch::jit::fuser::cuda { static std::atomic cuda_fusion_guard_mode{true}; @@ -131,7 +128,4 @@ bool skipNode(const std::string& symbol_str, bool flip) { getFuserInterface()->fn_skip_n(symbol_str, flip); } -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::cuda diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h index 0ccdfe2c9ebd9..926e4cb5d265c 100644 --- a/torch/csrc/jit/codegen/cuda/interface.h +++ b/torch/csrc/jit/codegen/cuda/interface.h @@ -13,10 +13,7 @@ * Registration is done in torch/csrc/jit/codegen/cuda/register_interface.cpp */ -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { +namespace torch::jit::fuser::cuda { TORCH_API std::atomic& getCudaFusionGuardMode(); @@ -52,7 +49,4 @@ TORCH_API bool isEnabled(); TORCH_API bool setEnabled(bool is_enabled); TORCH_API bool canBeEnabled(); -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::cuda diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h index 2e6d59596323d..72a94518b92a0 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h @@ -13,10 +13,7 @@ namespace at { struct DynamicLibrary; } -namespace torch { -namespace jit { -namespace fuser { -namespace cpu { +namespace torch::jit::fuser::cpu { // Represents a compiled CPU kernel and the metadata necessary to run it struct TORCH_API FusedKernelCPU : public FusedKernel { @@ -43,7 +40,4 @@ struct 
TORCH_API FusedKernelCPU : public FusedKernel { void (*kernel)(uint32_t, void**) = nullptr; }; -} // namespace cpu -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::cpu diff --git a/torch/csrc/jit/codegen/fuser/cpu/resource_strings.h b/torch/csrc/jit/codegen/fuser/cpu/resource_strings.h index 6d8bea228cfe6..134451f335f83 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/resource_strings.h +++ b/torch/csrc/jit/codegen/fuser/cpu/resource_strings.h @@ -2,10 +2,7 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace cpu { +namespace torch::jit::fuser::cpu { /*with type_as not checking type of its input, a fusion group can have non-fp32 tensor as input. Correct code for this case is generated, however, nvrtc does @@ -101,7 +98,4 @@ JIT_API void ${kernelName}(IndexType totalElements, void ** args) { } )"); -} // namespace cpu -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::cpu diff --git a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h index 9fb53bc962c5b..fdb0788d0a575 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h +++ b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h @@ -22,10 +22,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace cpu { +namespace torch::jit::fuser::cpu { #ifdef _MSC_VER int wmkstemps(wchar_t* tmpl, int suffix_len) { @@ -135,7 +132,4 @@ struct TempFile { std::string name_; }; -} // namespace cpu -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::cpu diff --git a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp index 67ed298ca7409..d07e1fd2309e8 100644 --- a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp +++ b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp @@ -4,10 +4,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { // Non-default dnnl::graph::allocator needs an allocator. // We would let it use c10::GetCPUAllocator's allocator, @@ -152,9 +149,6 @@ at::ScalarType LlgaTensorDesc::aten_scalar_type() const { } } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn #endif // AT_MKLDNN_ENABLED() diff --git a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h index 64eed4ff481ec..d869a46e55940 100644 --- a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h +++ b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h @@ -6,10 +6,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { // Engine represents a device and its context. 
From the device kind, the engine // knows how to generate code for the target device and what kind of device @@ -270,7 +267,4 @@ at::Tensor empty_llga( dnnl::graph::tensor llga_from_aten_tensor(const at::Tensor& tensor); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/decompose_silu.cpp b/torch/csrc/jit/codegen/onednn/decompose_silu.cpp index 4d6807500cdfb..8a9e36c2973e4 100644 --- a/torch/csrc/jit/codegen/onednn/decompose_silu.cpp +++ b/torch/csrc/jit/codegen/onednn/decompose_silu.cpp @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { static bool shouldDecomposeSilu(Node* node) { if (node->kind() != aten::silu) { @@ -59,7 +56,4 @@ void DecomposeSiluForLLGA(std::shared_ptr& graph) { EliminateDeadCode(graph); } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/decompose_silu.h b/torch/csrc/jit/codegen/onednn/decompose_silu.h index 9d9a51502c833..fc4f115f1bd23 100644 --- a/torch/csrc/jit/codegen/onednn/decompose_silu.h +++ b/torch/csrc/jit/codegen/onednn/decompose_silu.h @@ -2,14 +2,8 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void DecomposeSiluForLLGA(std::shared_ptr& graph); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp index 4d0f12564bd9c..ce76a3b3b760e 100644 --- a/torch/csrc/jit/codegen/onednn/defer_size_check.cpp +++ b/torch/csrc/jit/codegen/onednn/defer_size_check.cpp @@ -2,10 +2,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { class SizeCheckMover { private: @@ -82,7 +79,4 @@ void DeferSizeCheck(std::shared_ptr& graph) { SizeCheckMover(graph->block(), graph).run(); } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/defer_size_check.h b/torch/csrc/jit/codegen/onednn/defer_size_check.h index 6e31cf202d393..e6d654199b2ff 100644 --- a/torch/csrc/jit/codegen/onednn/defer_size_check.h +++ b/torch/csrc/jit/codegen/onednn/defer_size_check.h @@ -2,14 +2,8 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void DeferSizeCheck(std::shared_ptr& graph); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/graph_fuser.cpp b/torch/csrc/jit/codegen/onednn/graph_fuser.cpp index 2a956362688ec..1c68edca761ba 100644 --- a/torch/csrc/jit/codegen/onednn/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_fuser.cpp @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void CreateLlgaSubgraphs(std::shared_ptr& graph) { AliasDb db(graph); @@ -25,7 +22,4 @@ void CreateLlgaSubgraphs(std::shared_ptr& graph) { EliminateDeadCode(graph); } -} // namespace onednn -} // namespace fuser -} // namespace jit 
-} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/graph_fuser.h b/torch/csrc/jit/codegen/onednn/graph_fuser.h index ab37ad0211b7a..d0a802e273401 100644 --- a/torch/csrc/jit/codegen/onednn/graph_fuser.h +++ b/torch/csrc/jit/codegen/onednn/graph_fuser.h @@ -3,10 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { struct WorkBlock : public std::pair { using pair::pair; @@ -47,7 +44,4 @@ class GraphRewriter { // torch/csrc/jit/passes/create_autodiff_subgraphs.cpp void CreateLlgaSubgraphs(std::shared_ptr& graph); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.cpp b/torch/csrc/jit/codegen/onednn/graph_helper.cpp index 30f32f5994c1d..cc72489cec598 100644 --- a/torch/csrc/jit/codegen/onednn/graph_helper.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_helper.cpp @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { using opkind = dnnl::graph::op::kind; @@ -615,7 +612,4 @@ bool LlgaNodeWrapper::useOpaqueLayout(size_t offset) const { return n->is(attr::output_layouts)[offset] == OPAQUE_LAYOUT; } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.h b/torch/csrc/jit/codegen/onednn/graph_helper.h index fbb5eaa84aec7..bb81709287731 100644 --- a/torch/csrc/jit/codegen/onednn/graph_helper.h +++ b/torch/csrc/jit/codegen/onednn/graph_helper.h @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { #define STRIDED_LAYOUT 0 #define OPAQUE_LAYOUT 1 @@ -98,7 +95,4 @@ class LlgaNodeWrapper { Node* n; }; -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp index 71e7450165691..c8d7617fe8651 100644 --- a/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp +++ b/torch/csrc/jit/codegen/onednn/graph_rewriter.cpp @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void GraphRewriter::cleanupSubgraphs() { auto curNode = *block_->nodes().rbegin(); @@ -138,7 +135,4 @@ std::optional GraphRewriter::tryMerge(Node* consumer, Node* producer) { return consumer; } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/guard_shape.cpp b/torch/csrc/jit/codegen/onednn/guard_shape.cpp index ee595b5c8d718..a71f980d631f5 100644 --- a/torch/csrc/jit/codegen/onednn/guard_shape.cpp +++ b/torch/csrc/jit/codegen/onednn/guard_shape.cpp @@ -5,10 +5,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { //! [ Note -- prepareFusionGroupAndGuardOutputs implementation ] //! 
shamelessly copying code from NNC (tensorexpr_fuser) with very little @@ -39,7 +36,4 @@ void prepareFusionGroupAndGuardOutputs(Block* block) { } } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/guard_shape.h b/torch/csrc/jit/codegen/onednn/guard_shape.h index 46f8a396a1628..227aa35d10a98 100644 --- a/torch/csrc/jit/codegen/onednn/guard_shape.h +++ b/torch/csrc/jit/codegen/onednn/guard_shape.h @@ -2,14 +2,8 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void prepareFusionGroupAndGuardOutputs(Block* block); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/interface.cpp b/torch/csrc/jit/codegen/onednn/interface.cpp index 64c101e15fe7c..c3edd9f416130 100644 --- a/torch/csrc/jit/codegen/onednn/interface.cpp +++ b/torch/csrc/jit/codegen/onednn/interface.cpp @@ -16,10 +16,8 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit { +namespace fuser::onednn { void fuseGraph(std::shared_ptr& g) { // Follow the process of the tensorexpr_fuser in profiling mode: @@ -95,8 +93,7 @@ void fuseGraph(std::shared_ptr& g) { } } -} // namespace onednn -} // namespace fuser +} // namespace fuser::onednn static Operation createLlgaKernel(const Node* node) { auto kernel = std::make_shared(node); @@ -178,5 +175,4 @@ RegisterOperators oneDNNGuardOp({ createLlgaGuardKernel, AliasAnalysisKind::FROM_SCHEMA), }); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/codegen/onednn/interface.h b/torch/csrc/jit/codegen/onednn/interface.h index 26b8a307a3d5a..4fd940816308c 100644 --- a/torch/csrc/jit/codegen/onednn/interface.h +++ b/torch/csrc/jit/codegen/onednn/interface.h @@ -3,10 +3,8 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit { +namespace fuser::onednn { static std::atomic onednn_enabled{false}; @@ -16,8 +14,7 @@ static std::atomic& getLlgaEnabled() { C10_EXPORT void fuseGraph(std::shared_ptr& g); -} // namespace onednn -} // namespace fuser +} // namespace fuser::onednn struct C10_EXPORT RegisterLlgaFuseGraph : public PassManager { @@ -58,5 +55,4 @@ struct C10_EXPORT RegisterLlgaFuseGraph } }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/codegen/onednn/kernel.cpp b/torch/csrc/jit/codegen/onednn/kernel.cpp index bc127e7e59de6..fa04614e0ab03 100644 --- a/torch/csrc/jit/codegen/onednn/kernel.cpp +++ b/torch/csrc/jit/codegen/onednn/kernel.cpp @@ -4,10 +4,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { using namespace dnnl::graph; using data_type = dnnl::graph::logical_tensor::data_type; @@ -293,7 +290,4 @@ void LlgaKernel::run(Stack& stack) { #endif } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/kernel.h b/torch/csrc/jit/codegen/onednn/kernel.h index 6e32c8e3bc907..cf24190d9aac4 100644 --- a/torch/csrc/jit/codegen/onednn/kernel.h +++ b/torch/csrc/jit/codegen/onednn/kernel.h @@ -10,10 +10,7 @@ #include -namespace torch { -namespace jit { -namespace fuser { 
-namespace onednn { +namespace torch::jit::fuser::onednn { using ArgSpec = LlgaTensorDesc; using ArgSpecs = std::vector; @@ -89,7 +86,4 @@ class LlgaKernel { bool is_initialized_ = false; }; -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/layout_propagation.cpp b/torch/csrc/jit/codegen/onednn/layout_propagation.cpp index d2fdc61109903..7377f3156b103 100644 --- a/torch/csrc/jit/codegen/onednn/layout_propagation.cpp +++ b/torch/csrc/jit/codegen/onednn/layout_propagation.cpp @@ -2,10 +2,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { static void LayoutPropagation(Node* n) { if (!LlgaGraphHelper::isLlgaSubgraph(n)) @@ -47,7 +44,4 @@ void PropagateLayout(const std::shared_ptr& graph) { LayoutPropagation(graph->block()); } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/layout_propagation.h b/torch/csrc/jit/codegen/onednn/layout_propagation.h index 5e48a097cd43f..6af79ca78796a 100644 --- a/torch/csrc/jit/codegen/onednn/layout_propagation.h +++ b/torch/csrc/jit/codegen/onednn/layout_propagation.h @@ -2,14 +2,8 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { void PropagateLayout(const std::shared_ptr& graph); -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/operator.h b/torch/csrc/jit/codegen/onednn/operator.h index 9cbe6c32c8d73..1a40c4438b4d8 100644 --- a/torch/csrc/jit/codegen/onednn/operator.h +++ b/torch/csrc/jit/codegen/onednn/operator.h @@ -4,10 +4,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { class Operator { public: @@ -146,7 +143,4 @@ class Operator { dnnl::graph::op::kind k; }; -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp index d09b5777f9734..19866a349f536 100644 --- a/torch/csrc/jit/codegen/onednn/prepare_binary.cpp +++ b/torch/csrc/jit/codegen/onednn/prepare_binary.cpp @@ -3,10 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { static bool compareConstValue(Value* v, double d) { auto ival = toIValue(v); @@ -179,7 +176,4 @@ void PrepareBinaryForLLGA(const std::shared_ptr& graph) { ConvertScalarToTensor(graph->block()); } -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/prepare_binary.h b/torch/csrc/jit/codegen/onednn/prepare_binary.h index d7f90002e8fa7..beb66d8822b9d 100644 --- a/torch/csrc/jit/codegen/onednn/prepare_binary.h +++ b/torch/csrc/jit/codegen/onednn/prepare_binary.h @@ -2,10 +2,7 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { // Prepare binary ops for LLGA // @@ -20,7 +17,4 @@ namespace onednn { // void PrepareBinaryForLLGA(const std::shared_ptr& graph); -} // namespace onednn 
-} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/codegen/onednn/register_interface.cpp b/torch/csrc/jit/codegen/onednn/register_interface.cpp index a24f8fd14ed19..032b28909fddd 100644 --- a/torch/csrc/jit/codegen/onednn/register_interface.cpp +++ b/torch/csrc/jit/codegen/onednn/register_interface.cpp @@ -1,9 +1,6 @@ #include -namespace torch { -namespace jit { -namespace fuser { -namespace onednn { +namespace torch::jit::fuser::onednn { static bool canFuseNode(const Node* node) { switch (node->kind()) { @@ -48,7 +45,4 @@ class RegisterInterface { static RegisterInterface register_interface_; } // namespace -} // namespace onednn -} // namespace fuser -} // namespace jit -} // namespace torch +} // namespace torch::jit::fuser::onednn diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp index 7efad835b9764..3444da98da038 100644 --- a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp +++ b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp @@ -25,10 +25,7 @@ using namespace torch::jit; using namespace torch::jit::tensorexpr; -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { // TODO(mvz): temporarily disable NNC backend in mobile builds. /* @@ -446,7 +443,4 @@ static c10::IValue preprocess( // static auto reg = torch::jit::backend_preprocess_register("nnc", preprocess); -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.h b/torch/csrc/jit/mobile/nnc/aot_compiler.h index aee92906fcc51..307fd8833ee9e 100644 --- a/torch/csrc/jit/mobile/nnc/aot_compiler.h +++ b/torch/csrc/jit/mobile/nnc/aot_compiler.h @@ -4,10 +4,7 @@ #include #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { // Performs Ahead Of Time compilation of a given method in a model // returning the compiled function and LLVM assembly code @@ -18,7 +15,4 @@ TORCH_API std::pair, const std::string> aotCompile( const std::vector& types, const std::string& kernel_func_name = "func"); -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/backend.cpp b/torch/csrc/jit/mobile/nnc/backend.cpp index 89a96428a09b0..1cfe1bf50f1f5 100644 --- a/torch/csrc/jit/mobile/nnc/backend.cpp +++ b/torch/csrc/jit/mobile/nnc/backend.cpp @@ -3,10 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { class NNCBackend : public PyTorchBackendInterface { public: @@ -55,7 +52,4 @@ namespace { // static const auto cls = torch::jit::backend("nnc"); } // namespace -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/context.cpp b/torch/csrc/jit/mobile/nnc/context.cpp index cddbdd82c5efc..6ad10583b802a 100644 --- a/torch/csrc/jit/mobile/nnc/context.cpp +++ b/torch/csrc/jit/mobile/nnc/context.cpp @@ -7,10 +7,7 @@ #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { constexpr int64_t kProducedNNCFileFormatVersion = 0x1L; @@ -342,7 +339,4 @@ Function* CompilationUnit::find_function(const c10::QualifiedName& name) const { return it->second.get(); } 
-} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/context.h b/torch/csrc/jit/mobile/nnc/context.h index b9633ea5bfafc..c5c8b8e8897dd 100644 --- a/torch/csrc/jit/mobile/nnc/context.h +++ b/torch/csrc/jit/mobile/nnc/context.h @@ -8,10 +8,7 @@ #include #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { // Specify the requirements on an input tensor. // TODO: support input tensor with dynamic shape (PR #54982) @@ -223,7 +220,4 @@ class TORCH_API CompilationUnit { std::unordered_map> functions_; }; -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/registry.cpp b/torch/csrc/jit/mobile/nnc/registry.cpp index 088ac6ecd5bf8..18a15eccd23d5 100644 --- a/torch/csrc/jit/mobile/nnc/registry.cpp +++ b/torch/csrc/jit/mobile/nnc/registry.cpp @@ -1,13 +1,7 @@ #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { C10_DEFINE_REGISTRY(NNCKernelRegistry, NNCKernel); -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/mobile/nnc/registry.h b/torch/csrc/jit/mobile/nnc/registry.h index c68a4f7a19c60..22d0470d994a5 100644 --- a/torch/csrc/jit/mobile/nnc/registry.h +++ b/torch/csrc/jit/mobile/nnc/registry.h @@ -3,10 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace mobile { -namespace nnc { +namespace torch::jit::mobile::nnc { using nnc_kernel_function_type = int(void**); @@ -40,7 +37,4 @@ inline std::unique_ptr get_nnc_kernel(const std::string& id) { } // namespace registry -} // namespace nnc -} // namespace mobile -} // namespace jit -} // namespace torch +} // namespace torch::jit::mobile::nnc diff --git a/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp b/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp index 8ecab1bef9162..1d35b30c05024 100644 --- a/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp +++ b/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.cpp @@ -5,8 +5,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { @@ -70,5 +69,4 @@ Module DBRQuantRemoveRedundantAliases(Module& module) { return module; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h b/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h index 548d952014c32..1e4beba066988 100644 --- a/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h +++ b/torch/csrc/jit/passes/dbr_quantization/remove_redundant_aliases.h @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { // This function replaces instances of // @@ -17,5 +16,4 @@ namespace jit { // on the module forward, if it's safe to do so. 
TORCH_API Module DBRQuantRemoveRedundantAliases(Module& module); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.cpp index 8786af2ee7eb6..1f9b49c3c0a11 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.cpp @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { void convertSubgraphToSubBlock(Block* block) { for (auto it = block->nodes().begin(), end = block->nodes().end(); @@ -54,5 +53,4 @@ void ONNXAutogradFunctionProcess(std::shared_ptr& graph) { convertSubgraphToSubBlock(graph->block()); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.h b/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.h index 4c3c07bb6711d..4b1c854fa2b61 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/autograd_function_process.h @@ -2,10 +2,8 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { TORCH_API void ONNXAutogradFunctionProcess(std::shared_ptr& graph); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp index 3e516498272ef..4210cde0f52c1 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/common.cpp @@ -1,7 +1,6 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { bool IndexingPatternFinder::IsSameSource(const Node* n, const Node* m) { const auto source_n = n->sourceRange().source(); @@ -41,5 +40,4 @@ std::vector IndexingPatternFinder::FetchSliceAndSelect( return slice_and_select_node; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/common.h b/torch/csrc/jit/passes/onnx/pattern_conversion/common.h index eb4f12a94e4f9..34ab95aceff6f 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/common.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/common.h @@ -4,8 +4,7 @@ // Functions used by both encapsulation and conversion. -namespace torch { -namespace jit { +namespace torch::jit { struct IndexingPatternFinder { public: @@ -15,5 +14,4 @@ struct IndexingPatternFinder { static bool IsSameSource(const Node* n, const Node* m); }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp index cd975d0375fcb..d11336a13e19f 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp @@ -12,8 +12,7 @@ // EDITING THIS FILE? READ THIS FIRST! 
// see Note [Edit Pattern Conversion] in pattern_conversion.h -namespace torch { -namespace jit { +namespace torch::jit { // Converting inplace index_put to ONNX namespace { @@ -392,5 +391,4 @@ std::vector ConvertPatternFromSubblock( return res; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.h b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.h index 4fa3b0c47f99a..16fdedee947b0 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.h @@ -3,8 +3,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Introduction // @@ -42,5 +41,4 @@ TORCH_API std::vector ConvertPatternFromSubblock( py::dict& env, py::set& values_in_env); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp index 7a98567a529be..a51801ac8363c 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.cpp @@ -7,8 +7,7 @@ // EDITING THIS FILE? READ THIS FIRST! // see Note [Edit Pattern Encapsulation] in pattern_encapsulation.h -namespace torch { -namespace jit { +namespace torch::jit { namespace { @@ -87,5 +86,4 @@ std::optional EncapsulatePatternIntoSubblock(Node* n) { return std::nullopt; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h index 6673d4aba3a75..1f69cb8def116 100644 --- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h +++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_encapsulation.h @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { // Introduction // @@ -30,5 +29,4 @@ namespace jit { // pattern is stored as attr::name. 
TORCH_API std::optional EncapsulatePatternIntoSubblock(Node* n); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp b/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp index 2c83bcbc10e1f..35b19597be421 100644 --- a/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp +++ b/torch/csrc/jit/passes/quantization/dedup_module_uses.cpp @@ -5,8 +5,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { class ModuleUseDeduper { public: @@ -125,5 +124,4 @@ void DedupModuleUses(Module& module) { d.dedup(); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/dedup_module_uses.h b/torch/csrc/jit/passes/quantization/dedup_module_uses.h index 0204d5f73f04f..4094704129a36 100644 --- a/torch/csrc/jit/passes/quantization/dedup_module_uses.h +++ b/torch/csrc/jit/passes/quantization/dedup_module_uses.h @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { /** Recursively deduplicate multiple uses of the same module by * creating an instance clone for each use of the module, which means @@ -24,5 +23,4 @@ namespace jit { */ TORCH_API void DedupModuleUses(Module& module); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/finalize.cpp b/torch/csrc/jit/passes/quantization/finalize.cpp index ebbd379f8da69..f04d610643012 100644 --- a/torch/csrc/jit/passes/quantization/finalize.cpp +++ b/torch/csrc/jit/passes/quantization/finalize.cpp @@ -16,8 +16,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { @@ -275,5 +274,4 @@ Module FinalizeOnDevicePTQ( return module; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/finalize.h b/torch/csrc/jit/passes/quantization/finalize.h index d73addbc387f6..8325a32110b82 100644 --- a/torch/csrc/jit/passes/quantization/finalize.h +++ b/torch/csrc/jit/passes/quantization/finalize.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { /** \brief Backend specific pass to fuse dequantize - op - quantize calls * as quantized_op calls. 
@@ -59,5 +58,4 @@ TORCH_API Module FinalizeOnDevicePTQ( Module& module, QuantType quant_type, const std::string& method_name); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/fusion_passes.cpp b/torch/csrc/jit/passes/quantization/fusion_passes.cpp index 2dbfdfe061b3a..46070c4939f02 100644 --- a/torch/csrc/jit/passes/quantization/fusion_passes.cpp +++ b/torch/csrc/jit/passes/quantization/fusion_passes.cpp @@ -1,8 +1,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { void fuseQuantizeAddReluImpl(std::shared_ptr& graph) { @@ -59,5 +58,4 @@ void FuseQuantizedAddRelu(std::shared_ptr& graph) { fuseQuantizeAddReluImpl(graph); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/fusion_passes.h b/torch/csrc/jit/passes/quantization/fusion_passes.h index b316fe2adab92..c741d9cdb7e56 100644 --- a/torch/csrc/jit/passes/quantization/fusion_passes.h +++ b/torch/csrc/jit/passes/quantization/fusion_passes.h @@ -2,8 +2,6 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { TORCH_API void FuseQuantizedAddRelu(std::shared_ptr& graph); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/helper.cpp b/torch/csrc/jit/passes/quantization/helper.cpp index 7eea68eb10654..4e103b32701d9 100644 --- a/torch/csrc/jit/passes/quantization/helper.cpp +++ b/torch/csrc/jit/passes/quantization/helper.cpp @@ -5,8 +5,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { using graph_rewrite_helper::getFuncName; @@ -795,5 +794,4 @@ bool is_batchnorm3d_module( "__torch__.torch.nn.modules.batchnorm.BatchNorm3d"); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/helper.h b/torch/csrc/jit/passes/quantization/helper.h index 21efbff7aa694..d6a0a326f25b7 100644 --- a/torch/csrc/jit/passes/quantization/helper.h +++ b/torch/csrc/jit/passes/quantization/helper.h @@ -8,8 +8,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { using graph_rewrite_helper::getFuncName; @@ -212,5 +211,4 @@ bool is_batchnorm3d_module( const Match& match, const std::unordered_map& vmap); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp index 9aacd481a55b0..4a0d600ca1b94 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.cpp +++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp @@ -17,8 +17,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { using ModuleQConfigMap = std::unordered_map>; @@ -1720,5 +1719,4 @@ Module InsertObserversForOnDevicePTQ( cloned_module, observer_method_name, /* is_entry_point */ true); return cloned_module; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/insert_observers.h b/torch/csrc/jit/passes/quantization/insert_observers.h index e8857318261c8..7dbac9cfca670 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.h +++ b/torch/csrc/jit/passes/quantization/insert_observers.h @@ -14,8 +14,7 @@ struct hash { } // namespace std -namespace torch { -namespace jit { +namespace torch::jit { using QConfig = std::tuple; using QConfigDict = std::unordered_map>; @@ -64,5 +63,4 @@ TORCH_API Module 
InsertObserversForOnDevicePTQ( bool inplace, QuantType quant_type = QuantType::STATIC); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index 05c19bdb38a1f..8739c4fcaf424 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -15,8 +15,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { using graph_rewrite_helper::PatternInfo; @@ -1841,5 +1840,4 @@ Module InsertQuantDeQuantOnDevicePTQ( h.propagateQuantizationOps(module); return module; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.h b/torch/csrc/jit/passes/quantization/insert_quant_dequant.h index de2b31fdba7ca..9bda42edae413 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.h +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { /** Replicate quantize node for prim::If blocks, so that we can match * quantization patterns in prim::If blocks @@ -42,5 +41,4 @@ TORCH_API Module InsertQuantDeQuantOnDevicePTQ( bool debug, QuantType quant_type = QuantType::STATIC); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/quantization_patterns.h b/torch/csrc/jit/passes/quantization/quantization_patterns.h index 80cf46d7e021e..c5f8e796dca2c 100644 --- a/torch/csrc/jit/passes/quantization/quantization_patterns.h +++ b/torch/csrc/jit/passes/quantization/quantization_patterns.h @@ -10,8 +10,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { struct QuantFusionInfo { std::string quantized_op_name; @@ -1260,5 +1259,4 @@ graph(%a_dequant, %w_quant, %b, %stride, %padding, %output_padding, %groups, %di std::move(conv_transpose2d_with_quant_prepack)}}; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/quantization_type.cpp b/torch/csrc/jit/passes/quantization/quantization_type.cpp index 66e99c06a5294..290cbd725e79d 100644 --- a/torch/csrc/jit/passes/quantization/quantization_type.cpp +++ b/torch/csrc/jit/passes/quantization/quantization_type.cpp @@ -1,7 +1,6 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { std::ostream& operator<<(std::ostream& os, QuantType t) { switch (t) { @@ -17,5 +16,4 @@ std::ostream& operator<<(std::ostream& os, QuantType t) { return os; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/quantization_type.h b/torch/csrc/jit/passes/quantization/quantization_type.h index ac4afe90ed9ea..1b91854a5e5ca 100644 --- a/torch/csrc/jit/passes/quantization/quantization_type.h +++ b/torch/csrc/jit/passes/quantization/quantization_type.h @@ -2,8 +2,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Quantization type (dynamic quantization, static quantization). 
// Should match the Python enum in quantize_jit.py @@ -11,5 +10,4 @@ enum QuantType : std::uint8_t { DYNAMIC = 0, STATIC }; std::ostream& operator<<(std::ostream& os, QuantType t); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/register_packed_params.cpp b/torch/csrc/jit/passes/quantization/register_packed_params.cpp index c3696cdc5109c..589aedea3d8c3 100644 --- a/torch/csrc/jit/passes/quantization/register_packed_params.cpp +++ b/torch/csrc/jit/passes/quantization/register_packed_params.cpp @@ -7,8 +7,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { bool isPrepackNode(Node* n) { @@ -144,5 +143,4 @@ std::unordered_set RegisterPrePackParams( return packed_param_names; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/quantization/register_packed_params.h b/torch/csrc/jit/passes/quantization/register_packed_params.h index c1cbf1b27bb32..dcee7144f66f7 100644 --- a/torch/csrc/jit/passes/quantization/register_packed_params.h +++ b/torch/csrc/jit/passes/quantization/register_packed_params.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { using PrePackParamFilterFn = std::function; @@ -16,5 +15,4 @@ TORCH_API std::unordered_set RegisterPrePackParams( const std::string& attr_prefix); TORCH_API std::string joinPaths(const std::vector& paths); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp index 866feb97381ff..7ec05500ded32 100644 --- a/torch/csrc/jit/passes/utils/check_alias_annotation.cpp +++ b/torch/csrc/jit/passes/utils/check_alias_annotation.cpp @@ -6,8 +6,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { IValue deepCopy(const IValue& self) { @@ -305,5 +304,4 @@ void checkAliasAnnotation( checkWrites(inputsToCheck, inputsDeepCopy); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/check_alias_annotation.h b/torch/csrc/jit/passes/utils/check_alias_annotation.h index df491c8ea3d5a..e227c3bb45602 100644 --- a/torch/csrc/jit/passes/utils/check_alias_annotation.h +++ b/torch/csrc/jit/passes/utils/check_alias_annotation.h @@ -6,8 +6,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Verify that alias annotations are correct. See impl for definition of // "correct". 
@@ -18,5 +17,4 @@ TORCH_API void checkAliasAnnotation( const std::shared_ptr& graph, std::vector pythonInputs, const std::string& unqualifiedOpName); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/memory_dag.cpp b/torch/csrc/jit/passes/utils/memory_dag.cpp index 3ecbbb8273a4a..8ad213082f52f 100644 --- a/torch/csrc/jit/passes/utils/memory_dag.cpp +++ b/torch/csrc/jit/passes/utils/memory_dag.cpp @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { namespace { void makePointerToImpl(Element* from, Element* to) { @@ -232,5 +231,4 @@ void MemoryDAG::setWildcards( Element* MemoryDAG::unsafeMakeFreshValue(const Value* v) { return makeFreshValueImpl(v, indexToElementMap_); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/memory_dag.h b/torch/csrc/jit/passes/utils/memory_dag.h index 1d2292fe90c5b..dc6d5b24a09fe 100644 --- a/torch/csrc/jit/passes/utils/memory_dag.h +++ b/torch/csrc/jit/passes/utils/memory_dag.h @@ -16,8 +16,7 @@ // Uses a compressed index representation for faster comparisons typedef c10::SparseBitVector<256> MemoryLocations; -namespace torch { -namespace jit { +namespace torch::jit { struct Value; @@ -172,5 +171,4 @@ class TORCH_API MemoryDAGBuilder { // the map to construct the `MemoryDAG` std::vector> indexToElementMap_; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/op_registry.cpp b/torch/csrc/jit/passes/utils/op_registry.cpp index 5d4d9ce4a334d..2538c90b4575c 100644 --- a/torch/csrc/jit/passes/utils/op_registry.cpp +++ b/torch/csrc/jit/passes/utils/op_registry.cpp @@ -2,8 +2,7 @@ // Location for Commonly Used Shape registries -namespace torch { -namespace jit { +namespace torch::jit { // Requirements: // dims : preserved from the first argument @@ -72,5 +71,4 @@ std::shared_ptr ops_one_tensor_in_shape_transform() { }); return ops; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/op_registry.h b/torch/csrc/jit/passes/utils/op_registry.h index d68d1d6192d6c..85d9ac8c7d287 100644 --- a/torch/csrc/jit/passes/utils/op_registry.h +++ b/torch/csrc/jit/passes/utils/op_registry.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { // Moved from shape_analysis.cpp // Requirements: @@ -27,5 +26,4 @@ std::shared_ptr nn_ops_first_input_preserving(); // tensor inputs : 1 // tensor outputs : 1 std::shared_ptr ops_one_tensor_in_shape_transform(); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/optimization_utils.cpp b/torch/csrc/jit/passes/utils/optimization_utils.cpp index 2e2eb8299fdc6..e5c25f8a0a26b 100644 --- a/torch/csrc/jit/passes/utils/optimization_utils.cpp +++ b/torch/csrc/jit/passes/utils/optimization_utils.cpp @@ -1,7 +1,6 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { bool nonConstantParameters(Node* n) { // Checks if the parameters, not including the @@ -14,5 +13,4 @@ bool nonConstantParameters(Node* n) { return false; } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/optimization_utils.h b/torch/csrc/jit/passes/utils/optimization_utils.h index 6018fbea6daa9..720523ede4ccf 100644 --- a/torch/csrc/jit/passes/utils/optimization_utils.h +++ b/torch/csrc/jit/passes/utils/optimization_utils.h @@ -3,12 
+3,10 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { // Checks if the parameters, not including the // first param are all constants. bool nonConstantParameters(Node* n); -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 0cc07a18c05eb..8fd18e4717e28 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -9,9 +9,7 @@ #include -namespace torch { -namespace jit { -namespace SubgraphUtils { +namespace torch::jit::SubgraphUtils { namespace { bool hasSubgraph(Node* n) { @@ -633,6 +631,4 @@ std::string generateNameForGraph( return truncateStrWithHash(graph_name.str(), maxlen); } -} // namespace SubgraphUtils -} // namespace jit -} // namespace torch +} // namespace torch::jit::SubgraphUtils diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.h b/torch/csrc/jit/passes/utils/subgraph_utils.h index dd761409ca2d0..fc5ba3e415ee9 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.h +++ b/torch/csrc/jit/passes/utils/subgraph_utils.h @@ -4,14 +4,11 @@ #include #include -namespace torch { -namespace jit { - // Utilities for dealing with nodes that contain subgraphs. // // They handle the complexity of editing inputs/outputs as you merge nodes in // and out of subgraphs. -namespace SubgraphUtils { +namespace torch::jit::SubgraphUtils { // Create a new subgraph node that contains only `n`. The new subgraph will have // `subgraphKind` as its type. @@ -70,6 +67,4 @@ TORCH_API std::string generateNameForGraph( size_t maxlen = 40, const std::string& prefix = "fused"); -} // namespace SubgraphUtils -} // namespace jit -} // namespace torch +} // namespace torch::jit::SubgraphUtils diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.h b/torch/csrc/jit/tensorexpr/bounds_inference.h index 300cb89a788f5..67fff99dec791 100644 --- a/torch/csrc/jit/tensorexpr/bounds_inference.h +++ b/torch/csrc/jit/tensorexpr/bounds_inference.h @@ -6,9 +6,7 @@ #include #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { class Expr; class Buf; @@ -74,6 +72,4 @@ TORCH_API bool isOverlapping( const StorePtr& S, const LoadPtr& L); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/bounds_overlap.h b/torch/csrc/jit/tensorexpr/bounds_overlap.h index 5cc502cdecd32..0dbb69727875a 100644 --- a/torch/csrc/jit/tensorexpr/bounds_overlap.h +++ b/torch/csrc/jit/tensorexpr/bounds_overlap.h @@ -6,10 +6,7 @@ #include #include -namespace torch { -namespace jit { -namespace tensorexpr { -namespace analysis { +namespace torch::jit::tensorexpr::analysis { // A simple class containing the start and end of a range in a single dimension. 
struct TORCH_API Bound { @@ -121,7 +118,4 @@ std::vector TORCH_API subtractIndicesBounds( std::vector TORCH_API subtractIndicesBounds(const IndexBounds& A, const IndexBounds& B); -} // namespace analysis -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr::analysis diff --git a/torch/csrc/jit/tensorexpr/cpp_intrinsics.h b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h index 3149335ea30f9..0e4bb6a615254 100644 --- a/torch/csrc/jit/tensorexpr/cpp_intrinsics.h +++ b/torch/csrc/jit/tensorexpr/cpp_intrinsics.h @@ -1,8 +1,6 @@ #pragma once -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { constexpr auto cpp_intrinsics_definition = R"( namespace std { @@ -31,6 +29,4 @@ To bitcast(const From& v) { } // namespace std )"; -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/cuda_random.h b/torch/csrc/jit/tensorexpr/cuda_random.h index 987ac5211d929..ce59bba11e877 100644 --- a/torch/csrc/jit/tensorexpr/cuda_random.h +++ b/torch/csrc/jit/tensorexpr/cuda_random.h @@ -1,8 +1,6 @@ #pragma once -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { constexpr auto philox_random_string = R"( @@ -99,6 +97,4 @@ __device__ __inline__ float Uint32ToFloat(unsigned int x) { )"; -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/fwd_decls.h b/torch/csrc/jit/tensorexpr/fwd_decls.h index 84c34a278a099..d0a4acbc3169c 100644 --- a/torch/csrc/jit/tensorexpr/fwd_decls.h +++ b/torch/csrc/jit/tensorexpr/fwd_decls.h @@ -2,9 +2,7 @@ #include #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { template using NodePtr = std::shared_ptr; @@ -124,6 +122,4 @@ using SyncThreadsPtr = NodePtr; AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_DECLARE); #undef IMM_DECLARE -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 6afd053c8c42c..8360fb950fa2f 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -11,9 +11,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { enum CompareSelectOperation { kEQ = 0, @@ -918,6 +916,4 @@ TORCH_API ExprPtr flatten_index( const std::vector& indices, const std::vector& strides); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/ir_cloner.h b/torch/csrc/jit/tensorexpr/ir_cloner.h index 3336fb0dc59fa..dd626eeb4c9d9 100644 --- a/torch/csrc/jit/tensorexpr/ir_cloner.h +++ b/torch/csrc/jit/tensorexpr/ir_cloner.h @@ -5,9 +5,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { class TORCH_API IRCloner : public IRMutator { public: @@ -61,6 +59,4 @@ class TORCH_API IRCloner : public IRMutator { StmtPtr mutate(const CondPtr& v) override; }; -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/ir_verifier.h b/torch/csrc/jit/tensorexpr/ir_verifier.h index 020c01a23340e..e8e887ac80aed 100644 --- a/torch/csrc/jit/tensorexpr/ir_verifier.h +++ 
b/torch/csrc/jit/tensorexpr/ir_verifier.h @@ -3,9 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { class Expr; class ExprHandle; @@ -53,6 +51,4 @@ TORCH_API void verify(const StmtPtr&); TORCH_API void verify(const ExprPtr&); TORCH_API void verify(const ExprHandle&); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/conv2d.h b/torch/csrc/jit/tensorexpr/operators/conv2d.h index f842a1350a551..9aa328d98b6db 100644 --- a/torch/csrc/jit/tensorexpr/operators/conv2d.h +++ b/torch/csrc/jit/tensorexpr/operators/conv2d.h @@ -3,9 +3,7 @@ #include #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { // An API to compute 2D depthwise convolutions with bias. TORCH_API Tensor conv2d_depthwise( @@ -100,6 +98,4 @@ Tensor computeMkldnnPrepackedConvRun( const std::vector& outputStrides, const std::optional& outputType, at::Device device); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/matmul.h b/torch/csrc/jit/tensorexpr/operators/matmul.h index 40ef3cfd9b619..d572a1c396c0e 100644 --- a/torch/csrc/jit/tensorexpr/operators/matmul.h +++ b/torch/csrc/jit/tensorexpr/operators/matmul.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { Tensor computeMatmul( const std::vector& inputs, @@ -19,6 +17,4 @@ Tensor computeAddMM( const std::optional& outputType, at::Device device); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/norm.h b/torch/csrc/jit/tensorexpr/operators/norm.h index dbe6140cca8b4..e531943237b09 100644 --- a/torch/csrc/jit/tensorexpr/operators/norm.h +++ b/torch/csrc/jit/tensorexpr/operators/norm.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { Tensor computeBatchNorm( const std::vector& inputs, @@ -13,6 +11,4 @@ Tensor computeBatchNorm( const std::optional& outputType, at::Device device); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/pointwise.h b/torch/csrc/jit/tensorexpr/operators/pointwise.h index 1e3366a285876..8f8f6240d1984 100644 --- a/torch/csrc/jit/tensorexpr/operators/pointwise.h +++ b/torch/csrc/jit/tensorexpr/operators/pointwise.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { TORCH_API Tensor computeSign( const std::vector& inputs, @@ -81,6 +79,4 @@ Tensor computeScalar( const std::function& innerExpr); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.h b/torch/csrc/jit/tensorexpr/operators/quantization.h index d48c9e3273ba0..51bdbe730a6a0 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.h +++ b/torch/csrc/jit/tensorexpr/operators/quantization.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { TORCH_API ExprHandle quantizePerTensorQParamFromArg(ArgValue arg); @@ -155,6 +153,4 @@ TORCH_API Tensor 
computeQuantizedSigmoidExternalCall( const std::vector& outputStrides, const std::optional& outputType, at::Device); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/reduction.h b/torch/csrc/jit/tensorexpr/operators/reduction.h index 7d25e14a171ce..615d75c397c92 100644 --- a/torch/csrc/jit/tensorexpr/operators/reduction.h +++ b/torch/csrc/jit/tensorexpr/operators/reduction.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { TORCH_API Tensor computeSum( const std::vector& inputs, @@ -31,6 +29,4 @@ Tensor computeMax( const std::optional& outputType, at::Device device); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/tensorexpr/operators/softmax.h b/torch/csrc/jit/tensorexpr/operators/softmax.h index d5dd7fd429bed..f2a5698673cf3 100644 --- a/torch/csrc/jit/tensorexpr/operators/softmax.h +++ b/torch/csrc/jit/tensorexpr/operators/softmax.h @@ -2,9 +2,7 @@ #include -namespace torch { -namespace jit { -namespace tensorexpr { +namespace torch::jit::tensorexpr { Tensor computeSoftmax( const std::vector& inputs, @@ -12,6 +10,4 @@ Tensor computeSoftmax( const std::vector& outputStrides, bool log_softmax); -} // namespace tensorexpr -} // namespace jit -} // namespace torch +} // namespace torch::jit::tensorexpr diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index 97273ef4a110c..d6af1d2a1e388 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -23,10 +23,7 @@ #include #include -namespace torch { -namespace jit { - -namespace testing { +namespace torch::jit::testing { enum CheckType { CHECK, @@ -633,6 +630,4 @@ FileCheck* FileCheck::check_regex(const std::string& str) { return this; } -} // namespace testing -} // namespace jit -} // namespace torch +} // namespace torch::jit::testing diff --git a/torch/csrc/jit/testing/file_check.h b/torch/csrc/jit/testing/file_check.h index 6e9290f5130ba..fd09fcc6ad30b 100644 --- a/torch/csrc/jit/testing/file_check.h +++ b/torch/csrc/jit/testing/file_check.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { struct Graph; @@ -77,5 +76,4 @@ struct FileCheck { std::unique_ptr fcImpl; }; } // namespace testing -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/testing/hooks_for_testing.cpp b/torch/csrc/jit/testing/hooks_for_testing.cpp index 553938afd77c3..d23da57c74c9c 100644 --- a/torch/csrc/jit/testing/hooks_for_testing.cpp +++ b/torch/csrc/jit/testing/hooks_for_testing.cpp @@ -2,8 +2,7 @@ #include -namespace torch { -namespace jit { +namespace torch::jit { static ModuleHook emit_module_callback; void didFinishEmitModule(Module module) { @@ -28,5 +27,4 @@ std::pair getEmitHooks() { return std::make_pair(emit_module_callback, emit_function_callback); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/torch/csrc/jit/testing/hooks_for_testing.h b/torch/csrc/jit/testing/hooks_for_testing.h index 108dea3f1f72d..5613a0d24476d 100644 --- a/torch/csrc/jit/testing/hooks_for_testing.h +++ b/torch/csrc/jit/testing/hooks_for_testing.h @@ -4,8 +4,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { struct Module; using ModuleHook = std::function; @@ -17,5 +16,4 @@ TORCH_API void 
setEmitHooks(ModuleHook for_module, FunctionHook for_fn); TORCH_API std::pair getEmitHooks(); -} // namespace jit -} // namespace torch +} // namespace torch::jit From e78c4ded489044b4da060fea259b4bf3ef96faea Mon Sep 17 00:00:00 2001 From: Taras Date: Sat, 26 Oct 2024 17:41:37 +0000 Subject: [PATCH 123/161] Use the unicode variant of the Windows API (#47422) (#138605) Use the unicode variant of the Windows API in c10/util/Backtrace.cpp - #47422 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138605 Approved by: https://github.com/peterjc123, https://github.com/malfet --- c10/util/Backtrace.cpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index bfcacfd9740d1..8838cafb029e4 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -11,6 +11,7 @@ #include #ifdef _MSC_VER +#include #include #include #pragma comment(lib, "Dbghelp.lib") @@ -289,27 +290,31 @@ class GetBacktraceImpl { #elif defined(_MSC_VER) // !SUPPORTS_BACKTRACE const int max_name_len = 256; -std::string get_module_base_name(void* addr) { +std::wstring get_module_base_name(void* addr) { HMODULE h_module; - char module[max_name_len]; - strcpy(module, ""); - GetModuleHandleEx( + wchar_t module[max_name_len]; + wcscpy(module, L""); + + GetModuleHandleExW( GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - (LPCTSTR)addr, + (LPCWSTR)addr, &h_module); + if (h_module != NULL) { - GetModuleFileNameA(h_module, module, max_name_len); + GetModuleFileNameW(h_module, module, max_name_len); } - char* last_slash_pos = strrchr(module, '\\'); + + wchar_t* last_slash_pos = wcsrchr(module, L'\\'); if (last_slash_pos) { - std::string module_base_name(last_slash_pos + 1); + std::wstring module_base_name(last_slash_pos + 1); return module_base_name; } else { - std::string module_base_name(module); + std::wstring module_base_name(module); return module_base_name; } } + class SymbolHelper { public: static SymbolHelper& getInstance() { @@ -398,7 +403,8 @@ class GetBacktraceImpl { } // Get the module basename - std::string module = get_module_base_name(back_trace_[i_frame]); + std::string module = + c10::u16u8(get_module_base_name(back_trace_[i_frame])); // The pattern on Windows is // ` From 1d83a893c530cb46eb3772dd39ae5609aa46ca93 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Sat, 26 Oct 2024 17:42:05 +0000 Subject: [PATCH 124/161] [BE][MPS] Use templates in Repeat shader (#138962) - Instead of generating shader from templated code on host, just define two specializations of one kernel template - Get rid of unused `threads_per_threadgroup` argument - Replace `if (typeid(scalar_t) == typeid(int32_t))` with `if constexpr (std::is_same_v)` in the host code Pull Request resolved: https://github.com/pytorch/pytorch/pull/138962 Approved by: https://github.com/janeyx99 --- aten/src/ATen/native/mps/operations/Repeat.mm | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 9df8968006870..cbca4caa8a275 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -86,20 +86,27 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { } static mps::MetalShaderLibrary lib(R"METAL_REPEAT( -kernel void repeat_interleave(constant {0} * repeat_ptr [[buffer(0)]], - constant int64_t * cumsum_ptr [[buffer(1)]], - device 
{0} * result_ptr [[buffer(2)]], - uint threads_per_threadgroup [[threads_per_threadgroup]], - uint tid [[thread_position_in_grid]]) {{ +template +kernel void repeat_interleave( + constant T * repeat_ptr [[buffer(0)]], + constant int64_t * cumsum_ptr [[buffer(1)]], + device T * result_ptr [[buffer(2)]], + uint tid [[thread_position_in_grid]]) { int64_t end = cumsum_ptr[tid]; - {0} repeat = repeat_ptr[tid]; + T repeat = repeat_ptr[tid]; int64_t start = end - repeat; - for (uint j = start; j < end; j++) {{ + for (uint j = start; j < end; j++) { result_ptr[j] = tid; - }} -}} -)METAL_REPEAT", - 1); + } +} + +template [[host_name("repeat_interleave_int32_t")]] +kernel void repeat_interleave(constant int32_t*, constant int64_t*, device int32_t*, uint); + +template [[host_name("repeat_interleave_int64_t")]] +kernel void repeat_interleave(constant int64_t*, constant int64_t*, device int64_t*, uint); + +)METAL_REPEAT"); template void computeRepeatIndices(const index_t* repeat_ptr, @@ -113,9 +120,9 @@ void computeRepeatIndices(const index_t* repeat_ptr, TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer); std::string scalar_type; - if (typeid(index_t) == typeid(int32_t)) { + if constexpr (std::is_same_v) { scalar_type = "int32_t"; - } else if (typeid(index_t) == typeid(int64_t)) { + } else if constexpr (std::is_same_v) { scalar_type = "int64_t"; } else { TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type"); @@ -124,8 +131,8 @@ void computeRepeatIndices(const index_t* repeat_ptr, MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { - id computeEncoder = mpsStream->commandEncoder(); - id pipelineState = lib.getPipelineStateForFunc("repeat_interleave", {scalar_type}); + auto computeEncoder = mpsStream->commandEncoder(); + auto pipelineState = lib.getPipelineStateForFunc(fmt::format("repeat_interleave_{}", scalar_type)); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); From 3a6f0143818a963d654cbb4d0b2613f6368dc127 Mon Sep 17 00:00:00 2001 From: Yifu Wang Date: Fri, 25 Oct 2024 14:46:15 -0700 Subject: [PATCH 125/161] [Inductor] improve the stride preservation logic of user-visible outputs (#136732) ## Context Previously, the stride preservation of user-visible nodes worked as follows: - After joint-graph tracing, we recorded the **names** of user-visible nodes and passed them to GraphLowering. - In GraphLowering, we determined whether we needed to preserve the striding for a certain node by checking if the node's name was in `user_visible_outputs`. - We obtained the original strides by checking `node.meta["val"].stride()`. However, there's a problem with this approach: the nodes in output_node.args[0] and their strides could change between the completion of joint-graph tracing and the consumption of `user_visible_outputs` (e.g., during post-grad passes), making it unreliable. ## This PR - After joint graph tracing: - Record the original strides for all nodes in `output_nodes.args[0]` as `output_node.meta["original_output_strides"]` (recording for all nodes in case we need the info for other purposes such as debugging). - Record the indices of user-visible outputs as `output_node.meta["user_visible_output_idxs"]`. - Remove the original plumbing of `user_visible_outputs`. 
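
For illustration, a minimal sketch (not part of the patch) of the bookkeeping described above. It mirrors the `record_original_output_strides` / `is_user_visible_output` helpers added in the diff below, except that, as a simplifying assumption, it treats every tensor output as user visible; the real pass derives that subset from the forward's original output range.

```python
import torch

def record_output_metadata(gm: torch.fx.GraphModule) -> None:
    # Capture strides right after joint-graph tracing, before post-grad passes
    # can mutate output_node.args[0].
    output_node = gm.graph.find_nodes(op="output")[0]
    strides = []
    for out in output_node.args[0]:
        val = out.meta.get("val") if isinstance(out, torch.fx.Node) else None
        strides.append(val.stride() if isinstance(val, torch.Tensor) else None)
    output_node.meta["original_output_strides"] = strides
    # Simplification: mark every tensor output as user visible.
    output_node.meta["user_visible_output_idxs"] = [
        i for i, out in enumerate(output_node.args[0]) if isinstance(out, torch.fx.Node)
    ]

def wants_original_stride(node: torch.fx.Node) -> bool:
    # GraphLowering-side check: is this node a user-visible output, so its
    # recorded (pre-pass) stride should be preserved?
    output_node = node.graph.find_nodes(op="output")[0]
    outputs = list(output_node.args[0])
    return (
        node in outputs
        and outputs.index(node)
        in output_node.meta.get("user_visible_output_idxs", [])
    )
```

Keying the metadata on output positions (plus strides captured up front) rather than node names keeps it valid even when post-grad passes replace or rename the output nodes, which is exactly the failure mode described in the Context section.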
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136732 Approved by: https://github.com/Chillee --- .../tensor/parallel/test_micro_pipeline_tp.py | 4 ++ test/dynamo/test_subclasses.py | 3 +- test/inductor/test_torchinductor.py | 40 +++++++++++ torch/_inductor/compile_fx.py | 66 +++++++++++-------- torch/_inductor/graph.py | 50 +++++++++----- 5 files changed, 115 insertions(+), 48 deletions(-) diff --git a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py index 3a4b2eb946fc5..5502116284a30 100644 --- a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py +++ b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py @@ -222,6 +222,10 @@ def func(A_shard: torch.Tensor, B: torch.Tensor) -> torch.Tensor: compiled = torch.compile(func) code = run_and_get_triton_code(compiled, A_shard, B) + eager_stride = func(A_shard, B).stride() + compiled_stride = compiled(A_shard, B).stride() + self.assertEqual(eager_stride, compiled_stride) + if gather_dim == A_dims - 1: self.assertNotIn("fused_all_gather_matmul", code) self.assertIn("all_gather_into_tensor", code) diff --git a/test/dynamo/test_subclasses.py b/test/dynamo/test_subclasses.py index db3706f90eeab..379ae02aaa686 100644 --- a/test/dynamo/test_subclasses.py +++ b/test/dynamo/test_subclasses.py @@ -1821,7 +1821,7 @@ def f(x): @torch._dynamo.config.patch("inline_inbuilt_nn_modules", True) def test_mark_static_with_subclass_desugaring(self): - from typing import Any, Callable, Dict, List, Optional + from typing import Any, Callable, List, Optional from torch._dynamo.decorators import mark_static_address from torch._inductor.compile_fx import compile_fx @@ -1843,7 +1843,6 @@ def inner_compile( aot_mode: bool = False, is_inference: bool = False, boxed_forward_device_index: Optional[BoxedDeviceIndex] = None, - user_visible_outputs: Optional[Dict[str, None]] = None, layout_opt: Optional[bool] = None, extern_node_serializer: Optional[Callable[[List[Any]], Any]] = None, ): diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 73687f41d95a8..b35ca9a645fa0 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -9500,6 +9500,46 @@ def f(x): self.assertEqual(out_ref.stride(), out_test.stride()) self.assertEqual(x_ref, x_test) + @requires_gpu() + def test_stride_preservation_with_stride_modifying_fx_pass(self): + def f(x): + return x + 1 + + def custom_pass(g: torch.fx.Graph) -> None: + """ + Applies `lamda x: x.t().contiguous().t()` to the output. 
+ """ + output_node = g.find_nodes(op="output")[0] + assert len(output_node.args) == 1 + output = output_node.args[0][0] + + with g.inserting_before(output_node): + output = g.call_function( + torch.ops.aten.permute.default, args=(output, [1, 0]) + ) + output = g.call_function( + torch.ops.aten.clone.default, + args=(output,), + kwargs={"memory_format": torch.contiguous_format}, + ) + output = g.call_function( + torch.ops.aten.permute.default, args=(output, [1, 0]) + ) + output_node.args = ((output,),) + return g + + with config.patch( + post_grad_custom_post_pass=custom_pass, + ): + f_compiled = torch.compile(f) + + x = torch.rand(4, 4, device=GPU_TYPE) + y = f(x) + y_compiled = f_compiled(x) + + self.assertEqual(y, y_compiled) + self.assertEqual(y.stride(), y_compiled.stride()) + def test_int_input_dynamic_shapes(self): @torch.compile(dynamic=True) def fn(x, i): diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index 4af7b796091ea..bccbbb38eb918 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -191,6 +191,21 @@ def get_static_input_idxs(num_fixed: int) -> List[int]: return fixed + context.fw_metadata.static_input_indices +def record_original_output_strides(gm: GraphModule) -> None: + output_node = gm.graph.find_nodes(op="output")[0] + output_strides = [] + for output in output_node.args[0]: + if ( + isinstance(output, torch.fx.Node) + and (val := output.meta.get("val")) is not None + and isinstance(val, torch.Tensor) + ): + output_strides.append(val.stride()) + else: + output_strides.append(None) + output_node.meta["original_output_strides"] = output_strides + + @functools.lru_cache(None) def _step_logger() -> Callable[..., None]: return dynamo_logging.get_step_logger(log) @@ -494,7 +509,6 @@ class _CompileFxKwargs(TypedDict, total=False): cpp_wrapper: bool aot_mode: bool is_inference: bool - user_visible_outputs: Optional[Dict[str, None]] layout_opt: Optional[bool] extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] @@ -526,7 +540,6 @@ def compile_fx_inner( kwargs.setdefault("aot_mode", False) kwargs.setdefault("is_inference", False) kwargs.setdefault("boxed_forward_device_index", None) - kwargs.setdefault("user_visible_outputs", None) kwargs.setdefault("layout_opt", None) kwargs.setdefault("extern_node_serializer", None) @@ -748,9 +761,6 @@ def fx_codegen_and_compile( cpp_wrapper: bool = False, aot_mode: bool = False, is_inference: bool = False, - # Use a dict with None value rather than a set for deterministic - # iteration order just in case. - user_visible_outputs: Optional[Dict[str, None]] = None, layout_opt: Optional[bool] = None, extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None, ) -> Union[CompiledFxGraph, str]: @@ -825,6 +835,8 @@ def log_graph_runnable() -> str: with torch.no_grad(): fake_mode = fake_tensor_prop(gm, example_inputs) + record_original_output_strides(gm) + # pattern matcher passes might not preserve striding information # on node.meta["val"]. if in the future we rely on these being # correct we will need to fix. 
@@ -873,7 +885,6 @@ def log_graph_runnable() -> str: graph_id=graph_id, cpp_wrapper=cpp_wrapper, aot_mode=aot_mode, - user_visible_outputs=user_visible_outputs, extern_node_serializer=extern_node_serializer, is_inference=is_inference, is_backward=is_backward, @@ -895,7 +906,6 @@ def log_graph_runnable() -> str: graph_id=graph_id, cpp_wrapper=cpp_wrapper, aot_mode=aot_mode, - user_visible_outputs=user_visible_outputs, extern_node_serializer=extern_node_serializer, is_inference=is_inference, is_backward=is_backward, @@ -1258,9 +1268,9 @@ def fw_compiler_freezing( # for freezing, all graph outputs should be user visible *_, model_outputs_node = opt_model.graph.nodes model_outputs = model_outputs_node.args[0] - user_visible_outputs = dict.fromkeys( - n.name for n in model_outputs if isinstance(n, torch.fx.Node) - ) + model_outputs_node.meta["user_visible_output_idxs"] = [ + idx for idx, n in enumerate(model_outputs) if isinstance(n, torch.fx.Node) + ] static_input_idxs = list(range(num_fixed)) wrapper_new_args_unwrapped_indices: List[int] = [] @@ -1307,7 +1317,6 @@ def fw_compiler_freezing( is_inference=True, boxed_forward_device_index=forward_device, layout_opt=layout_opt, - user_visible_outputs=user_visible_outputs, ) # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper @@ -1493,10 +1502,8 @@ def _fw_compiler_base( num_example_inputs, len(example_inputs) ) - user_visible_outputs = {} - + model_outputs_node = output_node(model) if config.keep_output_stride: - model_outputs_node = output_node(model) model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args) num_model_outputs = len(model_outputs) @@ -1541,13 +1548,13 @@ def _fw_compiler_base( # of "graph" outputs. Make sure we're within bounds. assert orig_output_end_idx <= num_model_outputs - user_visible_outputs = dict.fromkeys( - n.name - for n in model_outputs[ - original_output_start_index:orig_output_end_idx - ] - if isinstance(n, torch.fx.Node) - ) + model_outputs_node.meta["user_visible_output_idxs"] = [ + idx + for idx in range(original_output_start_index, orig_output_end_idx) + if isinstance(model_outputs[idx], torch.fx.Node) + ] + else: + model_outputs_node.meta["user_visible_output_idxs"] = [] return inner_compile( model, @@ -1557,7 +1564,6 @@ def _fw_compiler_base( graph_id=graph_id, is_inference=is_inference, boxed_forward_device_index=forward_device, - user_visible_outputs=user_visible_outputs, ) fw_compiler = functools.partial(fw_compiler_base, is_inference=False) @@ -1592,14 +1598,17 @@ def bw_compiler( model: GraphModule, example_inputs: List[InputType] ) -> Union[CompiledFxGraph, str]: with dynamo_utils.dynamo_timed("compile_fx..bw_compiler"): - user_visible_outputs = {} - + model_outputs_node = output_node(model) if config.bw_outputs_user_visible: - model_outputs_node = output_node(model) model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args) - user_visible_outputs = dict.fromkeys( - n.name for n in model_outputs if isinstance(n, torch.fx.Node) - ) + model_outputs_node.meta["user_visible_output_idxs"] = [ + idx + for idx, n in enumerate(model_outputs) + if isinstance(n, torch.fx.Node) + ] + else: + model_outputs_node.meta["user_visible_output_idxs"] = [] + fixed = count_tangents(model) with config.patch( get_cpp_wrapper_config() @@ -1612,7 +1621,6 @@ def bw_compiler( is_backward=True, graph_id=graph_id, boxed_forward_device_index=forward_device, - user_visible_outputs=user_visible_outputs, ) # TODO: can add logging before/after the call to create_aot_dispatcher_function 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 5f2848a98d1ce..9179de5a5f618 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -193,9 +193,17 @@ def getattr_recursive( return attr_itr -def mark_nodes_dislike_padding( - g: Graph, user_visible_outputs: Optional[Dict[str, None]] -) -> None: +def is_user_visible_output(node: torch.fx.Node) -> bool: + output_node = node.graph.find_nodes(op="output")[0] + return ( + "user_visible_output_idxs" in output_node.meta + and node in output_node.args[0] + and output_node.args[0].index(node) + in output_node.meta["user_visible_output_idxs"] + ) + + +def mark_nodes_dislike_padding(g: Graph) -> None: """ Nodes like convolution/convolution_backward want its input to be dense. If we pad their inputs, we result in extra calls to copy kernels! On the other hand, padding usually helps reduction. @@ -238,6 +246,8 @@ def _get_overload_packet( else None ) + output_node = g.find_nodes(op="output")[0] + for cur in reversed(g.nodes): op = _get_overload_packet(cur) if not op: @@ -254,11 +264,7 @@ def _get_overload_packet( if prior_op not in ops_like_padding: prior.meta["dislike_padding"] = True # We only want to mark output nodes. So, move it after the above prior nodes process. - if ( - not config.pad_outputs - and user_visible_outputs - and cur.name in user_visible_outputs - ): + if not config.pad_outputs and is_user_visible_output(cur): cur.meta["dislike_padding"] = True @@ -320,7 +326,6 @@ def __init__( graph_id: Optional[int] = None, cpp_wrapper: bool = False, aot_mode: bool = False, - user_visible_outputs: Optional[Dict[str, None]] = None, layout_opt: Optional[bool] = None, extern_node_serializer: Optional[ Callable[[List[ir.ExternKernelNode]], Any] @@ -440,10 +445,7 @@ def __init__( self.find_nodes_prefer_channels_last() if self.layout_opt else OrderedSet() ) self._warned_fallback = {"aten.convolution_backward"} - self.user_visible_outputs = ( - user_visible_outputs if user_visible_outputs is not None else {} - ) - mark_nodes_dislike_padding(gm.graph, user_visible_outputs) + mark_nodes_dislike_padding(gm.graph) self.cache_key: str = "" # This is the cache key for the compiled artifact self.cache_path: str = "" # This is the path in the filesystem where the compiled artifact is stored self.cache_linemap: List[ @@ -1424,6 +1426,7 @@ def debug(msg: str) -> None: torch.ops.aten.resize_as.default, ] is_output = any(user.op == "output" for user in n.users) + is_user_visible = is_user_visible_output(n) is_input_for_as_strided = any( user.target in as_strided_ops for user in n.users ) @@ -1452,10 +1455,23 @@ def debug(msg: str) -> None: if (is_output or is_input_for_as_strided) and isinstance( n.meta["val"], torch.Tensor ): - strides = n.meta["val"].stride() - if len(strides): + if is_user_visible: + output_node = n.graph.find_nodes(op="output")[0] + output_idx = output_node.args[0].index(n) + original_output_strides = output_node.meta.get( + "original_output_strides" + ) + strides = ( + original_output_strides[output_idx] + if original_output_strides is not None + else None + ) + else: + strides = n.meta["val"].stride() + + if strides is not None and len(strides) > 0: allow_padding = ( - config.pad_outputs or n.name not in self.user_visible_outputs + config.pad_outputs or not is_user_visible ) and not is_input_for_as_strided dense = torch._prims_common.is_non_overlapping_and_dense( n.meta["val"] @@ -1468,7 +1484,7 @@ def debug(msg: str) -> None: and dense and len(result.get_size()) == 4 and n in 
self.nodes_prefer_channels_last - and n.name not in self.user_visible_outputs + and not is_user_visible and not is_input_for_as_strided ): strides = ir.FlexibleLayout.stride_ordered_for_memory_format( From fb36daac9fe3085535da4ce128dbda50dc4f1825 Mon Sep 17 00:00:00 2001 From: cyy Date: Sat, 26 Oct 2024 19:09:45 +0000 Subject: [PATCH 126/161] [7/N] Fix extra warnings brought by clang-tidy-17 (#138972) Fix extra warnings brought by clang-tidy-17 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138972 Approved by: https://github.com/Skylion007 --- aten/src/ATen/Parallel.h | 4 ++-- aten/src/ATen/ParallelFuture.h | 2 +- aten/src/ATen/ParallelNative.cpp | 6 +++--- aten/src/ATen/ParallelOpenMP.cpp | 9 +++++---- aten/src/ATen/ParallelThreadPoolNative.cpp | 4 ++-- aten/src/ATen/core/TensorBase.h | 5 +++-- aten/src/ATen/detail/PrivateUse1HooksInterface.h | 6 +++--- torch/csrc/Generator.cpp | 2 +- torch/csrc/PyInterpreter.cpp | 6 ++---- torch/csrc/autograd/VariableTypeUtils.h | 2 +- torch/csrc/autograd/python_legacy_variable.cpp | 2 +- torch/csrc/autograd/python_variable_indexing.cpp | 2 +- torch/csrc/utils.cpp | 2 +- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 966e29c0289f3..917524419f9a7 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -133,7 +133,7 @@ TORCH_API std::string get_parallel_info(); TORCH_API void set_num_interop_threads(int); // Returns the number of threads used for inter-op parallelism -TORCH_API int get_num_interop_threads(); +TORCH_API size_t get_num_interop_threads(); // Launches inter-op parallel task TORCH_API void launch(std::function func); @@ -142,7 +142,7 @@ void launch_no_thread_state(std::function fn); } // namespace internal // Launches intra-op parallel task -TORCH_API void intraop_launch(std::function func); +TORCH_API void intraop_launch(const std::function& func); // Returns number of intra-op threads used by default TORCH_API int intraop_default_num_threads(); diff --git a/aten/src/ATen/ParallelFuture.h b/aten/src/ATen/ParallelFuture.h index 042cd92da1934..7b459036ce6d7 100644 --- a/aten/src/ATen/ParallelFuture.h +++ b/aten/src/ATen/ParallelFuture.h @@ -8,6 +8,6 @@ namespace at { // Launches intra-op parallel task, returns a future TORCH_API c10::intrusive_ptr intraop_launch_future( - std::function func); + const std::function& func); } // namespace at diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index a2e1992650009..5edd9da05994a 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -273,10 +273,10 @@ bool in_parallel_region() { #endif // C10_MOBILE } -void intraop_launch(std::function func) { +void intraop_launch(const std::function& func) { #ifndef C10_MOBILE if (!in_parallel_region() && get_num_threads() > 1) { - _get_intraop_pool().run(std::move(func)); + _get_intraop_pool().run(func); } else { // execute inline if we're in parallel region func(); @@ -289,7 +289,7 @@ void intraop_launch(std::function func) { } c10::intrusive_ptr intraop_launch_future( - std::function func) { + const std::function& func) { #ifndef C10_MOBILE auto future = c10::make_intrusive(c10::NoneType::get()); if (!in_parallel_region() && get_num_threads() > 1) { diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 1c128bfc3b28d..388cbb1a4b9f9 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -14,9 +14,10 @@ namespace at { #if 
AT_MKLDNN_ENABLED() -namespace native { namespace mkldnn { +namespace native::mkldnn { +// NOLINTNEXTLINE(misc-use-internal-linkage) void clear_computation_cache(); -}} // namespace native::mkldnn +} // namespace native::mkldnn #endif namespace { @@ -100,13 +101,13 @@ bool in_parallel_region() { #endif } -void intraop_launch(std::function func) { +void intraop_launch(const std::function& func) { // execute inline in openmp case func(); } c10::intrusive_ptr intraop_launch_future( - std::function func) { + const std::function& func) { func(); auto future = c10::make_intrusive(NoneType::get()); future->markCompleted(); diff --git a/aten/src/ATen/ParallelThreadPoolNative.cpp b/aten/src/ATen/ParallelThreadPoolNative.cpp index 348dabdacde33..75dd56c263eb7 100644 --- a/aten/src/ATen/ParallelThreadPoolNative.cpp +++ b/aten/src/ATen/ParallelThreadPoolNative.cpp @@ -56,7 +56,7 @@ void set_num_interop_threads(int nthreads) { "has started or set_num_interop_threads called"); } -int get_num_interop_threads() { +size_t get_num_interop_threads() { at::internal::lazy_init_num_threads(); int nthreads = num_interop_threads.load(); if (nthreads > 0) { @@ -82,7 +82,7 @@ void launch_no_thread_state(std::function fn) { void launch(std::function func) { // NOLINTNEXTLINE(modernize-avoid-bind) internal::launch_no_thread_state(std::bind([]( - std::function f, ThreadLocalState thread_locals) { + const std::function& f, const ThreadLocalState& thread_locals) { ThreadLocalStateGuard guard(thread_locals); f(); }, diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2d202a63efa75..4b278ff27700d 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -104,6 +104,7 @@ class TORCH_API TensorBase { } TensorBase(const TensorBase&) = default; TensorBase(TensorBase&&) noexcept = default; + ~TensorBase() noexcept = default; public: // Creates a new wrapper from TensorImpl. 
Intentionally a free method because @@ -625,7 +626,7 @@ class TORCH_API TensorBase { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); T* ptr = nullptr; - if constexpr (std::is_const::value) { + if constexpr (std::is_const_v) { ptr = const_data_ptr(); } else { ptr = mutable_data_ptr(); @@ -645,7 +646,7 @@ class TORCH_API TensorBase { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); T* ptr = nullptr; - if constexpr (std::is_const::value) { + if constexpr (std::is_const_v) { ptr = const_data_ptr(); } else { ptr = mutable_data_ptr(); diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 3820c960dfe57..bb656e0bb4ad5 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h @@ -24,17 +24,17 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`."); } - virtual bool isPinnedPtr(const void* data) const override { + bool isPinnedPtr(const void* data) const override { return false; } - virtual Allocator* getPinnedMemoryAllocator() const override { + Allocator* getPinnedMemoryAllocator() const override { TORCH_CHECK( false, "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`."); } - virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + bool hasPrimaryContext(DeviceIndex device_index) const override { TORCH_CHECK_NOT_IMPLEMENTED( false, "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`."); diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp index 1da0a3229db0a..c36d9071bbd79 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -98,7 +98,7 @@ static PyObject* THPGenerator_getState(PyObject* _self, PyObject* noargs) { std::scoped_lock lock(gen.mutex()); auto state_tensor = gen.get_state(); - return THPVariable_Wrap(std::move(state_tensor)); + return THPVariable_Wrap(state_tensor); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index 0eb41434b2b62..57410c5cadffc 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -198,8 +198,7 @@ py::object torchDispatchFromTensorImpl( c10::intrusive_ptr:: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) unsafe_reclaim_from_nonowning(const_cast(self))); - auto self_p = - py::reinterpret_steal(THPVariable_Wrap(std::move(self_t))); + auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); // NB: this may not be a python tensor if you got here from a mode! 
// TORCH_INTERNAL_ASSERT(isPythonTensor(self_t)); append_overloaded_tensor(&overloaded_args, self_p.ptr()); @@ -940,8 +939,7 @@ void ConcretePyInterpreterVTable::reset_backward_hooks( Tensor(c10::intrusive_ptr:: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) unsafe_reclaim_from_nonowning(const_cast(self))); - auto self_p = - py::reinterpret_steal(THPVariable_Wrap(std::move(self_t))); + auto self_p = py::reinterpret_steal(THPVariable_Wrap(self_t)); PyObject_SetAttrString(self_p.ptr(), "_backward_hooks", Py_None); END_HANDLE_TH_ERRORS_PYBIND } diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index e6aebfafb1adc..73d5d1c13a543 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -398,7 +398,7 @@ namespace { // call in this functor so it can be passed to c10::BoxedKernel::makeFromFunctor class WrapperFunctor final : public c10::OperatorKernel { public: - WrapperFunctor(JitDecompInterface* impl) : impl_(impl){}; + WrapperFunctor(JitDecompInterface* impl) : impl_(impl) {} void operator()( const c10::OperatorHandle& op, diff --git a/torch/csrc/autograd/python_legacy_variable.cpp b/torch/csrc/autograd/python_legacy_variable.cpp index 0647abc69e090..3c6e9378f55d3 100644 --- a/torch/csrc/autograd/python_legacy_variable.cpp +++ b/torch/csrc/autograd/python_legacy_variable.cpp @@ -104,7 +104,7 @@ static PyObject* THPVariable_pynew( } } - return THPVariable_Wrap(std::move(var)); + return THPVariable_Wrap(var); END_HANDLE_TH_ERRORS } diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 308ca0d58213c..059db01d49cac 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -397,7 +397,7 @@ PyObject* THPVariable_getitem(PyObject* self, PyObject* index) { // ensure we return a shallow copy for things like x[...] sliced = at::alias(sliced); } - return THPVariable_Wrap(std::move(sliced)); + return THPVariable_Wrap(sliced); } // indexing by tensors ("advanced" indexing) diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp index 0663fd48cd9fb..fceb90933d8de 100644 --- a/torch/csrc/utils.cpp +++ b/torch/csrc/utils.cpp @@ -257,7 +257,7 @@ namespace torch::gdb { // call free than delete[] from withing gdb. // Currently the code for computing the repr of a tensor is written in Python, // so we need to wrap the Tensor into a Python object first. 
-char* tensor_repr(at::Tensor tensor) { +char* tensor_repr(const at::Tensor& tensor) { PyGILState_STATE gil = PyGILState_Ensure(); PyObject* pytensor = nullptr; PyObject* repr = nullptr; From 42994234a6e27a94b5f50d67f503ce10b30a9b31 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 26 Oct 2024 20:59:22 +0000 Subject: [PATCH 127/161] std::value/std::type -> std::_v/std::_t (#138746) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138746 Approved by: https://github.com/cyyever, https://github.com/malfet --- aten/src/ATen/core/Dict.h | 2 +- aten/src/ATen/core/Dict_inl.h | 16 ++--- aten/src/ATen/core/DistributionsHelper.h | 8 +-- aten/src/ATen/core/List_inl.h | 8 +-- aten/src/ATen/core/Tensor.h | 2 +- aten/src/ATen/core/blob.h | 2 +- aten/src/ATen/core/boxing/BoxedKernel_impl.h | 2 +- .../ATen/core/boxing/KernelFunction_impl.h | 6 +- aten/src/ATen/core/boxing/impl/boxing.h | 2 +- .../impl/make_boxed_from_unboxed_functor.h | 38 +++++----- .../ATen/core/op_registration/infer_schema.h | 6 +- .../core/op_registration/op_registration.h | 22 +++--- .../ATen/cpu/vec/vec256/zarch/vec256_zarch.h | 72 +++++++++---------- .../src/ATen/cpu/vec/vec512/vec512_bfloat16.h | 4 +- aten/src/ATen/cpu/vec/vec512/vec512_float.h | 2 +- aten/src/ATen/cuda/CUDASparseDescriptors.h | 16 ++--- aten/src/ATen/cuda/detail/IndexUtils.cuh | 2 +- aten/src/ATen/native/DistributionTemplates.h | 12 ++-- aten/src/ATen/native/Dropout.cpp | 2 +- aten/src/ATen/native/EmbeddingBag.cpp | 4 +- aten/src/ATen/native/LossCTC.cpp | 4 +- aten/src/ATen/native/LossNLL.cpp | 2 +- aten/src/ATen/native/LossNLL2d.cpp | 2 +- aten/src/ATen/native/Math.h | 14 ++-- aten/src/ATen/native/UpSample.h | 6 +- aten/src/ATen/native/UpSampleBicubic2d.cpp | 2 +- .../ATen/native/cpu/DistributionKernels.cpp | 12 ++-- aten/src/ATen/native/cpu/FillKernel.cpp | 2 +- aten/src/ATen/native/cpu/IndexKernel.cpp | 4 +- aten/src/ATen/native/cpu/Loops.h | 2 +- aten/src/ATen/native/cpu/MaxPoolKernel.cpp | 4 +- .../ATen/native/cpu/RangeFactoriesKernel.cpp | 2 +- aten/src/ATen/native/cpu/Reduce.h | 6 +- .../src/ATen/native/cpu/group_norm_kernel.cpp | 4 +- .../src/ATen/native/cpu/layer_norm_kernel.cpp | 2 +- aten/src/ATen/native/cuda/Normalization.cuh | 6 +- aten/src/ATen/native/miopen/Conv_miopen.cpp | 2 +- aten/src/ATen/native/miopen/RNN_miopen.cpp | 2 +- .../native/quantized/cpu/XnnpackUtils.cpp | 2 +- aten/src/ATen/native/quantized/cpu/qconv.cpp | 2 +- .../src/ATen/native/quantized/cpu/qlinear.cpp | 2 +- .../native/sparse/cuda/SparseCsrTensorMath.cu | 2 +- .../ATen/native/sparse/cuda/SparseMatMul.cu | 16 ++--- .../sparse/cuda/SparseSemiStructuredOps.cu | 8 +-- .../transformers/cuda/flash_attn/dropout.h | 2 +- .../cuda/mem_eff_attention/kernels/cutlassB.h | 18 ++--- .../cuda/mem_eff_attention/kernels/cutlassF.h | 18 ++--- .../kernels/generate_kernels.py | 2 +- aten/src/ATen/native/vulkan/api/Utils.h | 2 +- aten/src/ATen/test/vec_test_all_types.h | 4 +- c10/core/DeviceArray.h | 2 +- c10/core/TensorImpl.h | 2 +- c10/test/util/string_view_test.cpp | 18 ++--- c10/util/ArrayRef.h | 2 +- c10/util/SmallVector.h | 2 +- c10/util/intrusive_ptr.h | 18 ++--- c10/util/strong_type.h | 12 ++-- caffe2/perfkernels/embedding_lookup_idx.cc | 2 +- test/cpp/api/static.cpp | 3 +- torch/_inductor/codecache.py | 2 +- torch/_inductor/codegen/cpp_prefix.h | 2 +- .../include/torch/nn/modules/container/any.h | 2 +- .../torch/nn/modules/container/named_any.h | 2 +- .../inductor/aoti_runtime/arrayref_tensor.h | 2 +- torch/csrc/jit/backends/backend.h | 2 +- 
torch/csrc/jit/runtime/argument_spec.h | 8 +-- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 2 +- torch/csrc/xpu/Module.cpp | 2 +- torchgen/gen.py | 2 +- 69 files changed, 233 insertions(+), 238 deletions(-) diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index b1f4ebe62e732..d88250fbdd08c 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -80,7 +80,7 @@ class DictEntryRef final { template void setValue(Value_&& value) const { - static_assert(std::is_constructible::value, "Wrong type for the value argument of setValue()"); + static_assert(std::is_constructible_v, "Wrong type for the value argument of setValue()"); iterator_->second = Value(std::forward(value)); } diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index c48d7ec38ae5a..5a4302836cb9a 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -69,8 +69,8 @@ Dict::Dict() :Dict(make_intrusive( detail::DictImpl::dict_map_type(), detail::DictImpl::DictElementTypes{getTypePtr(), getTypePtr()})) { - static_assert(!std::is_same::value, "This constructor is not valid for Dict. Please use c10::impl::GenericDict(keyType, valueType) instead."); - static_assert(!std::is_same::value, "This constructor is not valid for Dict<_, IValue>. Please use c10::impl::GenericDict(keyType, valueType) instead."); + static_assert(!std::is_same_v, "This constructor is not valid for Dict. Please use c10::impl::GenericDict(keyType, valueType) instead."); + static_assert(!std::is_same_v, "This constructor is not valid for Dict<_, IValue>. Please use c10::impl::GenericDict(keyType, valueType) instead."); } template @@ -78,8 +78,8 @@ Dict::Dict(TypePtr keyType, TypePtr valueType) : Dict(make_intrusive( detail::DictImpl::dict_map_type(), detail::DictImpl::DictElementTypes {std::move(keyType), std::move(valueType)})) { - static_assert(std::is_same::value, "This constructor is only valid for c10::impl::GenericDict."); - static_assert(std::is_same::value, "This constructor is only valid for c10::impl::GenericDict."); + static_assert(std::is_same_v, "This constructor is only valid for c10::impl::GenericDict."); + static_assert(std::is_same_v, "This constructor is only valid for c10::impl::GenericDict."); } template @@ -118,8 +118,8 @@ void Dict::clear() const { template template std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) const { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); + static_assert(std::is_constructible_v, "Wrong type for the key argument of Dict::insert"); + static_assert(std::is_constructible_v, "Wrong type for the value argument of Dict::insert"); auto inserted = impl_->dict.emplace( Key(std::forward(key)), Value(std::forward(value))); @@ -129,8 +129,8 @@ std::pair::iterator, bool> Dict::insert(Ke template template std::pair::iterator, bool> Dict::insert_or_assign(Key_&& key, Value_&& value) const { - static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign"); - static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign"); + static_assert(std::is_constructible_v, "Wrong type for the key argument of Dict::insert_or_assign"); + static_assert(std::is_constructible_v, "Wrong type for the value argument of Dict::insert_or_assign"); auto inserted = impl_->dict.insert_or_assign( Key(std::forward(key)), 
Value(std::forward(value))); diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 39004008d0070..c0d9a6212a841 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -42,10 +42,10 @@ struct uniform_int_from_to_distribution { template C10_HOST_DEVICE inline T operator()(RNG generator) { if (( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && range_ >= 1ULL << 32) + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) && range_ >= 1ULL << 32) { return transformation::uniform_int_from_to(generator->random64(), range_, base_); } else { diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 0d223122599c4..3e61fa24ee02a 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -21,7 +21,7 @@ List::List() : List(make_intrusive( typename c10::detail::ListImpl::list_type(), getTypePtr())) { - static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType) instead."); + static_assert(!std::is_same_v, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType) instead."); } template @@ -29,7 +29,7 @@ List::List(ArrayRef values) : List(make_intrusive( typename c10::detail::ListImpl::list_type(), getTypePtr())) { - static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType)."); + static_assert(!std::is_same_v, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType)."); impl_->list.reserve(values.size()); for (const T& element : values) { impl_->list.push_back(element); @@ -39,7 +39,7 @@ List::List(ArrayRef values) template List::List(std::initializer_list initial_values) : List(ArrayRef(initial_values)) { - static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType)."); + static_assert(!std::is_same_v, "This constructor is not valid for List. 
Please use c10::impl::GenericList(elementType)."); } template @@ -47,7 +47,7 @@ List::List(TypePtr elementType) : List(make_intrusive( typename c10::detail::ListImpl::list_type(), std::move(elementType))) { - static_assert(std::is_same::value || std::is_same>::value, + static_assert(std::is_same_v || std::is_same>::value, "This constructor is only valid for c10::impl::GenericList or List."); } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index 8172cf31e7522..de887a024c22f 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -72,7 +72,7 @@ template auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t { // Return the grad argument in case of a hook with void return type to have an // std::function with Tensor return type - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Expected hook to return void"); return _register_hook([fn=std::forward(hook)](const TensorBase& grad_base) { TensorRef grad(grad_base); diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 35ee3b358c991..37b9e62fcdea9 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -95,7 +95,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { template T* GetMutable() { static_assert( - std::is_default_constructible::value, + std::is_default_constructible_v, "GetMutable can't be called with non-default-constructible types. " "Try using specialized methods"); if (IsType()) { diff --git a/aten/src/ATen/core/boxing/BoxedKernel_impl.h b/aten/src/ATen/core/boxing/BoxedKernel_impl.h index 421b85cca3ec5..bffed5bf95440 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h +++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h @@ -80,7 +80,7 @@ inline BoxedKernel BoxedKernel::makeNamedNotSupported() { template inline BoxedKernel BoxedKernel::makeFromFunctor(std::unique_ptr kernelFunctor) { - static_assert(std::is_base_of::value, "Tried to call BoxedKernel::makeFromFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_base_of_v, "Tried to call BoxedKernel::makeFromFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); return BoxedKernel( std::move(kernelFunctor), [](OperatorKernel* kernel, const OperatorHandle& op, DispatchKeySet ks, Stack* stack) { diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index d505b30575834..8ce2c3760aecc 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -162,7 +162,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr::value, "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); #endif - static_assert(std::is_base_of::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_base_of_v, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. 
Please have the functor inherit from it."); auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed::call; void* void_unboxed_fn = reinterpret_cast(unboxed_fn); @@ -184,7 +184,7 @@ inline KernelFunction KernelFunction::makeFromBoxedFunctor(std::unique_ptr inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) { static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); + static_assert(!std::is_same_v, "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); #if defined(__GNUC__) && defined(__SANITIZE_ADDRESS__) && !defined(__CUDACC__) TORCH_INTERNAL_ASSERT(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); #else @@ -207,7 +207,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) template inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) { static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); + static_assert(!std::is_same_v, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); return makeFromUnboxedFunctor>>( diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index e109b808ff0c2..a1a80588d1d36 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -383,7 +383,7 @@ struct BoxedKernelWrapper< // that the last RetCount elements are of type `Tensor&`. auto result = guts::tuple_take(ArgTuple{std::forward(args)...}); static_assert( - std::is_same::value, + std::is_same_v, "The parameter list of an op returning a tuple of Tensor references " "must end with an equal number of Tensor reference parameters." ); diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 729691c1cd825..951228793b840 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -154,39 +154,39 @@ namespace impl { template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported input type: List. Please use List, List or Tensor instead."); }; template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported input type: ArrayRef. 
Please use List, List or Tensor instead."); }; template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported input type: OptionalArrayRef. Please use List, List or Tensor instead."); }; template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported input type: std::array. Please use std::array instead."); }; template - struct assert_is_valid_input_type::value>> { + struct assert_is_valid_input_type>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template - struct assert_is_valid_input_type::value>> { + struct assert_is_valid_input_type>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported input type: const char*. Please use c10::string_view instead."); }; @@ -196,12 +196,12 @@ namespace impl { "You tried to register a kernel with an unsupported input type: vector. Please use List instead."); }; template - struct assert_is_valid_input_type::value && !guts::typelist::contains::value>> { + struct assert_is_valid_input_type && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; template - struct assert_is_valid_input_type::value>> { + struct assert_is_valid_input_type>> { static_assert(guts::false_t::value, "You tried to register a kernel taking c10::SymInt by reference. Please accept it by value instead."); }; @@ -238,7 +238,7 @@ namespace impl { : assert_is_valid_output_type { static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported output type: Dict where Key is invalid. We only support int64_t, double, bool, and string."); - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported output type: Dict. Please use Dict or Dict."); }; @@ -249,21 +249,21 @@ namespace impl { "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict instead."); static_assert(guts::typelist::contains::value, "You tried to register a kernel with an unsupported output type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string."); - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict or Dict."); }; template struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported output type: List. 
Please use List, List or Tensor instead."); }; template struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported output type: std::vector. Please use List, List or Tensor instead."); // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel with an unsupported output type: std::vector. Please use List instead."); }; @@ -271,7 +271,7 @@ namespace impl { template struct assert_is_valid_output_type, AllowDeprecatedTypes> : assert_is_valid_output_type { - static_assert(!std::is_same::value, + static_assert(!std::is_same_v, "You tried to register a kernel with an unsupported output type: std::array. Please use std::array instead."); }; @@ -280,13 +280,13 @@ namespace impl { // there if they didn't exist, but we can show a better error message // in some common error scenarios. template - struct assert_is_valid_output_type::value>> { + struct assert_is_valid_output_type>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template - struct assert_is_valid_output_type::value>> { + struct assert_is_valid_output_type>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported output type: const char*. Please use c10::string_view instead."); }; @@ -296,7 +296,7 @@ namespace impl { "You tried to register a kernel with an unsupported output type: vector. Please use List instead."); }; template - struct assert_is_valid_output_type::value && !guts::typelist::contains::value>> { + struct assert_is_valid_output_type && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; @@ -417,7 +417,7 @@ namespace impl { struct return_to_ivalue final {}; template - struct return_to_ivalue::value>> final { + struct return_to_ivalue>> final { static IValue call(T&& v) { assert_is_valid_output_type(); return c10::ivalue::from(std::move(v)); @@ -564,7 +564,7 @@ namespace impl { template struct make_boxed_from_unboxed_functor final { - static_assert(std::is_base_of::value, + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); static void call(OperatorKernel* functor, const OperatorHandle&, DispatchKeySet dispatchKeySet, Stack* stack) { @@ -574,7 +574,7 @@ namespace impl { // We don't want to expose the DispatchKeySet type to jit, so we don't include this argument on the stack. // See Note [Plumbing Keys Through The Dispatcher] for the background. 
using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func::parameter_types; - constexpr bool has_outputs = !std::is_same::value; + constexpr bool has_outputs = !std::is_same_v; constexpr size_t num_inputs = guts::typelist::size::value; if constexpr (has_outputs) { // Decay ReturnType to ReturnType_ so that if a reference gets returned, we actually store it by value diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index 2f845f7c4c10f..50dceeebdba2b 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -37,10 +37,10 @@ constexpr int checkStaticTypes() { // Give nice error messages for some of the common error cases. // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT static_assert(std::conjunction< - bool_t::value || std::is_same::value || std::is_same::value || std::is_same::value>... + bool_t || std::is_same_v || std::is_same_v || std::is_same_v>... >::value, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type"); static_assert(std::conjunction< - bool_t::value>... + bool_t>... >::value, "INVALID TYPE: float is not supported as an argument type, use double instead"); return 0; } @@ -87,7 +87,7 @@ struct createReturns, void> final { }; template -struct createReturns::value && !guts::is_instantiation_of::value>> final { +struct createReturns && !guts::is_instantiation_of::value>> final { static constexpr std::array call() { return createReturns>::call(); } diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index f309ee2f277b3..32f003c218ae4 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -159,8 +159,8 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if KernelFunctor is actually a functor std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && { - static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); return std::move(*this).kernel( dispatch_key, @@ -211,8 +211,8 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if KernelFunctor is actually a functor std::enable_if_t::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && { - static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) 
must match one of the constructors of Functor."); + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); return std::move(*this).kernel( std::nullopt, @@ -239,7 +239,7 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if FuncType is actually a function std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key) && { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); return std::move(*this).kernel( @@ -268,7 +268,7 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if FuncType is actually a function std::enable_if_t::value, Options&&> catchAllKernel() && { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); return std::move(*this).kernel( @@ -283,7 +283,7 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if FuncType is actually a function std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); return std::move(*this).kernel( @@ -298,7 +298,7 @@ class TORCH_API RegisterOperators final { template // enable_if: only enable it if FuncType is actually a function std::enable_if_t::value, Options&&> catchAllKernel(FuncType* kernel_func) && { - static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. 
internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); return std::move(*this).kernel( @@ -518,7 +518,7 @@ class TORCH_API RegisterOperators final { */ template // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction. - std::enable_if_t::value && !std::is_same::value, RegisterOperators&&> + std::enable_if_t::value && !std::is_same_v, RegisterOperators&&> op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && { constexpr bool AllowLegacyTypes = true; return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( @@ -549,7 +549,7 @@ class TORCH_API RegisterOperators final { // enable_if: only enable it if Lambda is actually a stateless lambda std::enable_if_t::value && guts::is_stateless_lambda>::value, RegisterOperators&&> op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { - static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); constexpr bool AllowLegacyTypes = true; return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( @@ -566,7 +566,7 @@ class TORCH_API RegisterOperators final { // enable_if: only enable it if Lambda is actually a functor but not a stateless lambda std::enable_if_t::value && !guts::is_stateless_lambda>::value, RegisterOperators&&> op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { - static_assert(!std::is_base_of::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. 
Please use the new RegisterOperators::options().kernel() based API instead."); constexpr bool AllowLegacyTypes = true; return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index 931da5678437b..c23f2e03381a0 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -22,18 +22,18 @@ inline namespace CPU_CAPABILITY { template constexpr bool is_zarch_implemented() { return ( - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value || - std::is_same::value || std::is_same::value); + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); } template constexpr bool is_zarch_implemented_quant() { return ( - std::is_same::value || - std::is_same::value || - std::is_same::value); + std::is_same_v || + std::is_same_v || + std::is_same_v); } template @@ -790,14 +790,14 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized C10_ALWAYS_INLINE abs() const { return {vec_abs(_vec0), vec_abs(_vec1)}; } template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized C10_ALWAYS_INLINE abs() const { return {_vec0, _vec1}; } @@ -828,7 +828,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized angle() const { auto tmp = blendv( Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); @@ -837,7 +837,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized angle() const { return blendv( Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0)); @@ -855,7 +855,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> int zero_mask() const { auto cmp = (*this == Vectorized(0)); constexpr auto mask_zero_bits = GetBpermZeroMask(); @@ -902,7 +902,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapOrdinary(float (*const f)(float)) const { float a00 = f(_vec0[0]); float a01 = f(_vec0[1]); @@ -917,14 +917,14 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapOrdinary(double (*const f)(double)) const { return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1])); } template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapOrdinary( float (*const f)(float, float), const Vectorized& b) const { @@ -941,7 +941,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapOrdinary( double (*const f)(double, double), const Vectorized& b) const { @@ -956,7 +956,7 @@ struct Vectorized()>> { typename FloatOp, typename DoubleOp, typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapSleef(FloatOp f, DoubleOp d) const { vtype a0 = f(_vec0); vtype a1 = f(_vec1); @@ -967,7 +967,7 @@ struct Vectorized()>> { typename FloatOp, typename DoubleOp, typename U 
= T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapSleef(FloatOp f, DoubleOp d) const { return Vectorized(d(_vec0), d(_vec1)); } @@ -976,7 +976,7 @@ struct Vectorized()>> { typename FloatOp, typename DoubleOp, typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) const { vtype a0 = f(_vec0, b._vec0); @@ -988,7 +988,7 @@ struct Vectorized()>> { typename FloatOp, typename DoubleOp, typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b) const { return Vectorized(d(_vec0, b._vec0), d(_vec1, b._vec1)); @@ -1112,7 +1112,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized minimum(const Vectorized& other) const { return {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; } @@ -1120,7 +1120,7 @@ struct Vectorized()>> { /* Propagates NaN if either input is a NaN. */ template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized minimum(const Vectorized& other) const { Vectorized tmp = {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)}; tmp = blendv(tmp, *this, isnan()); @@ -1129,7 +1129,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized maximum(const Vectorized& other) const { return {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; } @@ -1137,7 +1137,7 @@ struct Vectorized()>> { /* Propagates NaN if either input is a NaN. */ template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized maximum(const Vectorized& other) const { Vectorized tmp = {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)}; tmp = blendv(tmp, *this, isnan()); @@ -1146,7 +1146,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized clamp_min(const Vectorized& min) const { return {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)}; } @@ -1154,7 +1154,7 @@ struct Vectorized()>> { /* Keeps NaN if actual value is NaN */ template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized clamp_min(const Vectorized& min) const { Vectorized tmp = {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)}; return blendv(tmp, *this, isnan()); @@ -1162,7 +1162,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized clamp_max(const Vectorized& max) const { return {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)}; } @@ -1170,7 +1170,7 @@ struct Vectorized()>> { /* Keeps NaN if actual value is NaN */ template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized clamp_max(const Vectorized& max) const { Vectorized tmp = {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)}; return blendv(tmp, *this, isnan()); @@ -1178,7 +1178,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized swapped() const { auto swap_mask = GetSwapMaskFloat(); vtype v0 = vec_perm(_vec0, _vec0, swap_mask); @@ -1188,7 +1188,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> 
Vectorized swapped() const { vtype v0 = {_vec0[1], _vec0[0]}; vtype v1 = {_vec1[1], _vec1[0]}; @@ -1197,7 +1197,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> static Vectorized mergee(Vectorized& first, Vectorized& second) { return { vec_mergee(first._vec0, second._vec0), @@ -1206,7 +1206,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> static Vectorized mergeo(Vectorized& first, Vectorized& second) { return { vec_mergeo(first._vec0, second._vec0), @@ -1243,21 +1243,21 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized mergee() const { return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)}; } template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized mergeo() const { return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)}; } template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized to_vec_float_helper() const { int32_t values[8] = { _vec0[0], @@ -1278,7 +1278,7 @@ struct Vectorized()>> { template < typename U = T, - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> Vectorized to_vec_uint8_helper() const { // helper function for float to uint8_t conversion uint8_t values[8] = { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index dcdb682c56208..ba6e1c2a4d149 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -1384,7 +1384,7 @@ inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat1 } template ::value && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> + typename std::enable_if_t && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const BFloat16* src, int64_t ld_src, BFloat16* dst, int64_t ld_dst) { transpose_mxn(src, ld_src, dst, ld_dst, M, N); } @@ -1426,7 +1426,7 @@ inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int6 } template ::value && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> + typename std::enable_if_t && ((M <= 32 && M != 16) || (N <= 32 && N != 16)), int> = 0> inline void transpose_mxn(const Half* src, int64_t ld_src, Half* dst, int64_t ld_dst) { transpose_mxn(src, ld_src, dst, ld_dst, M, N); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 843e2dfcb8795..0771d95add723 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -701,7 +701,7 @@ inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, i } template ::value, int> = 0> + typename std::enable_if_t, int> = 0> inline void transpose_mxn(const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { transpose_mxn(src, ld_src, dst, ld_dst, M, N); } diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 36e1530e284fb..7fc482f2a3fbd 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -61,15 +61,15 @@ class ConstCuSparseDescriptor { #endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS #if defined(USE_ROCM) -using cusparseMatDescr = std::remove_pointer::type; -using cusparseDnMatDescr = 
std::remove_pointer::type; -using cusparseDnVecDescr = std::remove_pointer::type; -using cusparseSpMatDescr = std::remove_pointer::type; -using cusparseSpMatDescr = std::remove_pointer::type; -using cusparseSpGEMMDescr = std::remove_pointer::type; +using cusparseMatDescr = std::remove_pointer_t; +using cusparseDnMatDescr = std::remove_pointer_t; +using cusparseDnVecDescr = std::remove_pointer_t; +using cusparseSpMatDescr = std::remove_pointer_t; +using cusparseSpMatDescr = std::remove_pointer_t; +using cusparseSpGEMMDescr = std::remove_pointer_t; #if AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() -using bsrsv2Info = std::remove_pointer::type; -using bsrsm2Info = std::remove_pointer::type; +using bsrsv2Info = std::remove_pointer_t; +using bsrsm2Info = std::remove_pointer_t; #endif #endif diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh index db8519389e9ff..367ab10d3d3bb 100644 --- a/aten/src/ATen/cuda/detail/IndexUtils.cuh +++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh @@ -23,7 +23,7 @@ getTensorInfo(const at::TensorBase &t) { scalar* data_ptr = nullptr; - if constexpr (std::is_const::value) { + if constexpr (std::is_const_v) { data_ptr = t.const_data_ptr(); } else { data_ptr = t.mutable_data_ptr(); diff --git a/aten/src/ATen/native/DistributionTemplates.h b/aten/src/ATen/native/DistributionTemplates.h index 38c171e56dfae..c6013b6fbae5f 100644 --- a/aten/src/ATen/native/DistributionTemplates.h +++ b/aten/src/ATen/native/DistributionTemplates.h @@ -42,9 +42,9 @@ namespace at::native::templates { template int64_t update_from(int64_t from) { static_assert( - std::is_floating_point::value || - std::is_same::value || - std::is_same::value, "scalar_t must be floating-point type"); + std::is_floating_point_v || + std::is_same_v || + std::is_same_v, "scalar_t must be floating-point type"); const auto from_plus_1 = static_cast(static_cast(from + 1)); if (from_plus_1 < from) { int64_t from_ = std::abs(from + 1); @@ -59,9 +59,9 @@ int64_t update_from(int64_t from) { template int64_t update_to(int64_t to) { static_assert( - std::is_floating_point::value || - std::is_same::value || - std::is_same::value, "scalar_t must be floating-point type"); + std::is_floating_point_v || + std::is_same_v || + std::is_same_v, "scalar_t must be floating-point type"); const auto to_minus_1 = static_cast(static_cast(to - 1)); if (to_minus_1 >= to) { int64_t to_ = std::abs(to - 1); diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 366a00487ff5f..f7d32579165b4 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -25,7 +25,7 @@ namespace at::native { namespace { template -using Ctype = typename std::conditional::type; +using Ctype = typename std::conditional_t; Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sym_sizes(); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 58dc1b991d267..ea97ac8a5ad38 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -280,7 +280,7 @@ index_select_add( for (int64_t i = start_idx; i < end_idx; i++) { // Convert FP32 intermediate buffer result back to 16 bit for // output dtype - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { // FP16 for (const auto d : c10::irange(ddim)) { (output_data + i * ddim)[d] = @@ -662,7 +662,7 @@ index_select_scale_add( for (int64_t i = start_idx; i < end_idx; i++) { // Convert FP32 intermediate buffer result back to 16 
bit for // output dtype - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { // FP16 for (const auto d : c10::irange(ddim)) { (output_data + i * ddim)[d] = diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 336ddfd704439..530f3cf066ec7 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -130,7 +130,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const // log_probs: input_len x batch_size x num_labels // targets [int64]: batch_size x target_length OR sum(target_lengths) constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; + using target_t = typename std::conditional_t; Tensor neg_log_likelihood, log_alpha; size_t tg_target_stride; @@ -233,7 +233,7 @@ template Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK, bool zero_infinity) { constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; + using target_t = typename std::conditional_t; int64_t max_input_length = log_probs.size(0); int64_t batch_size = log_probs.size(1); int64_t num_labels = log_probs.size(2); diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 0b07e79551659..3930bb8a50e65 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -147,7 +147,7 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - if constexpr (std::is_const::value) { + if constexpr (std::is_const_v) { return source.defined() ? source.const_data_ptr() : nullptr; } else { return source.defined() ? source.data_ptr() : nullptr; diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 13c575a1a7bb3..4e63a300c0207 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -35,7 +35,7 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - if constexpr (std::is_const::value) { + if constexpr (std::is_const_v) { return source.defined() ? source.const_data_ptr() : nullptr; } else { return source.defined() ? source.data_ptr() : nullptr; diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index e04265e44f8e5..fd4ca881d2c6d 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -625,7 +625,7 @@ static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { // exp(a - x). scalar_t ax, fac, res, num, numfac; - static scalar_t MAXLOG = std::is_same::value ? + static scalar_t MAXLOG = std::is_same_v ? 7.09782712893383996843E2 : 88.72283905206835; static scalar_t EXP1 = 2.718281828459045; static scalar_t lanczos_g = 6.024680040776729583740234375; @@ -655,7 +655,7 @@ static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_series(scalar_t a, scalar_t x) { // Compute igam using DLMF 8.11.4. [igam1] - static scalar_t MACHEP = std::is_same::value ? + static scalar_t MACHEP = std::is_same_v ? 
1.11022302462515654042E-16 : 5.9604644775390625E-8; static int MAXITER = 2000; @@ -693,7 +693,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { scalar_t sum = 0; scalar_t term, logx; static scalar_t MAXITER = 2000; - static scalar_t MACHEP = std::is_same::value ? + static scalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; for (n = 1; n < MAXITER; n++) { @@ -942,7 +942,7 @@ static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam int k, n, sgn; int maxpow = 0; - static scalar_t MACHEP = std::is_same::value ? + static scalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; scalar_t lambda = x / a; scalar_t sigma = (x - a) / a; @@ -1007,11 +1007,11 @@ static scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { scalar_t ans, ax, c, yc, r, t, y, z; scalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; int MAXITER = 2000; - static scalar_t MACHEP = std::is_same::value ? + static scalar_t MACHEP = std::is_same_v ? 1.11022302462515654042E-16 : 5.9604644775390625E-8; - static scalar_t BIG = std::is_same::value ? + static scalar_t BIG = std::is_same_v ? 4.503599627370496e15 : 16777216.; - static scalar_t BIGINV = std::is_same::value ? + static scalar_t BIGINV = std::is_same_v ? 2.22044604925031308085e-16 : 5.9604644775390625E-8; ax = _igam_helper_fac(a, x); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 769201804eafa..479bd8e359686 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -477,17 +477,17 @@ inline void compute_source_index_and_lambda( // It will not be used by data types other than BFloat16 and Half. template || !std::is_same::value, int> = 0> + typename std::enable_if_t || !std::is_same_v, int> = 0> void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { TORCH_CHECK((is_reduced_floating_point_v), "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") - TORCH_CHECK((std::is_same::value), + TORCH_CHECK((std::is_same_v), "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") return; } template && std::is_same::value, int> = 0> + typename std::enable_if_t && std::is_same_v, int> = 0> void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { using bVec = Vectorized; using fVec = Vectorized; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 44892ebd4aad8..b02d809bb57a6 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -129,7 +129,7 @@ static void upsample_bicubic2d_backward_out_frame( at::parallel_for(0, channels, at::internal::GRAIN_SIZE / output_slice_size / 4, [&](int64_t start, int64_t end) { opmath_t* acc_data_ptr = nullptr; std::unique_ptr buffer_data; - if constexpr (!std::is_same::value) { + if constexpr (!std::is_same_v) { buffer_data = std::make_unique(input_slice_size); acc_data_ptr = buffer_data.get(); memset(acc_data_ptr, 0, sizeof(opmath_t) * input_slice_size); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 7ee014058d70d..ed3d00fa6c86f 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -60,7 +60,7 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional::value && contig) { + if (std::is_same_v && 
contig) { tmp_int_tensor = self; } else { tmp_int_tensor = at::empty(self.sizes(), self.options().dtype(at::kInt)); @@ -81,7 +81,7 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional::value && contig) { + if (!std::is_same_v && contig) { scalar_t *self_seg = self_ptr + begin; int* tmp_seg = sample_int_ptr + begin; at::vec::convert(tmp_seg, self_seg, len); @@ -129,17 +129,17 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional::value || std::is_same::value; + constexpr bool is_df = std::is_same_v || std::is_same_v; if (is_df && contig) { tmp_tensor = self; - } else if (std::is_same::value) { + } else if (std::is_same_v) { tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kDouble)); } else { tmp_tensor = at::empty(self.sizes(), self.options().dtype(at::kFloat)); } scalar_t *self_ptr = self.data_ptr(); - using tmp_scalar_t = typename std::conditional_t::value, double, float>; + using tmp_scalar_t = typename std::conditional_t, double, float>; tmp_scalar_t *sample_ptr = tmp_tensor.data_ptr(); // Intel MKL vRngExponential variate originally does not exclude 0. @@ -159,7 +159,7 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional 0) { VSLStreamStatePtr stream; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { vslNewStream(&stream, VSL_BRNG_MCG31, seed); vslSkipAheadStream(stream, begin); vdRngExponential(VSL_RNG_METHOD_EXPONENTIAL_ICDF, stream, len, diff --git a/aten/src/ATen/native/cpu/FillKernel.cpp b/aten/src/ATen/native/cpu/FillKernel.cpp index 43a562306e341..8365ac4954b7d 100644 --- a/aten/src/ATen/native/cpu/FillKernel.cpp +++ b/aten/src/ATen/native/cpu/FillKernel.cpp @@ -16,7 +16,7 @@ namespace { template void fill_non_native_type(TensorIterator& iter, const Scalar& value_scalar) { auto value = value_scalar.to().x; - using H = typename std::make_signed::type; // Signed type has more acceleration + using H = typename std::make_signed_t; // Signed type has more acceleration // Reserve the representation of value. static_cast(value) is implementation defined. 
H val = *reinterpret_cast(std::addressof(value)); cpu_kernel_vec( diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 0b3d13beb9d58..9d1cc477cbbc6 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -388,7 +388,7 @@ void cpu_masked_select_serial_kernel(TensorIterator& iter, const func_t& f) { char* mask = data[2]; for (const auto i : c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + strides[2] * i); - if constexpr (!std::is_same::value) { + if constexpr (!std::is_same_v) { TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only"); } if (mask_value) { @@ -426,7 +426,7 @@ void cpu_masked_select_kernel(TensorIterator& iter, const func_t& f) { char* mask_prefix_sum = data[3]; for (const auto i : c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + strides[2] * i); - if constexpr (!std::is_same::value) { + if constexpr (!std::is_same_v) { TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only"); } if (mask_value) { diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index a910a329482b8..37c810c2dd991 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -172,7 +172,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ using traits = function_traits; using result_type = typename traits::result_type; - constexpr int num_outputs = std::tuple_size::value; + constexpr int num_outputs = std::tuple_size_v; constexpr int ntensors = traits::arity + num_outputs; // Copying strides to temporary array helps auto vectorization in older GCC diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index 15b784f055216..9bfc16832767c 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -19,7 +19,7 @@ namespace { template bool is_nan(scalar_t v) { - if (std::is_integral::value || std::is_same::value) { + if (std::is_integral_v || std::is_same_v) { return false; } return std::isnan(v); @@ -429,7 +429,7 @@ void cpu_max_pool_channels_last( // temp buffer holding max value with opmath_t std::unique_ptr max_arr; opmath_t* max_ptr = nullptr; - if (!std::is_same::value) { + if (!std::is_same_v) { max_arr = std::make_unique(size); max_ptr = max_arr.get(); } diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index 28adc7040cfb8..c50d000fc9447 100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -45,7 +45,7 @@ static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, cons static void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "linspace_cpu", [&]() { // step should be of double type for all integral types - using step_t = std::conditional_t::value, double, scalar_t>; + using step_t = std::conditional_t, double, scalar_t>; const scalar_t start = scalar_start.to(); const scalar_t end = scalar_end.to(); // Cast `end` and `start` to `step_t`, since range can be larger than scalar_t for integral types diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 09a8ba3b170fa..6c9efbb0f6e7f 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ 
b/aten/src/ATen/native/cpu/Reduce.h @@ -114,7 +114,7 @@ inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_ template static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) { - // static_assert(std::is_same::value, "data types must match"); + // static_assert(std::is_same_v, "data types must match"); if (index < num_outputs) { char *out = (char *) iter.data_ptr(index); *(res_t *) out = result; @@ -202,7 +202,7 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { typename c_traits::result_type>::value, "all accumulate types must match"); static_assert( - std::is_default_constructible::value, + std::is_default_constructible_v, "the accumulate type must be default-constructible" ); const int num_outputs = iter.noutputs(); @@ -229,7 +229,7 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { int max_threads = at::get_num_threads(); AT_ASSERT(max_threads > 0); static_assert( - !std::is_same::value, + !std::is_same_v, "Concurrently modifying different references into std::vector is UB." ); std::vector buffer((unsigned)max_threads, init); diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index 0aee364b49d8c..b0411fa604fc2 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -1237,7 +1237,7 @@ ApplyInputGradientsChannelsLastRowMov( template inline typename std:: - enable_if::value, std::tuple>::type + enable_if, std::tuple>::type CalcInternalGradientsChannelsLast( const T* X_data, const T* dY_data, @@ -1292,7 +1292,7 @@ inline typename std:: template inline typename std:: - enable_if::value, std::tuple>::type + enable_if, std::tuple>::type CalcInternalGradientsChannelsLast( const T* X_data, const T* dY_data, diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index c2dbd0d7c7858..615a70c20d872 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -296,7 +296,7 @@ void layer_norm_backward_frame( } template && std::is_same::value, int> = 0> + typename std::enable_if_t && std::is_same_v, int> = 0> void layer_norm_backward_frame( const T* dY_data, const T* X_data, diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index 3b6f93ee36bea..554b53f666113 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -212,8 +212,8 @@ template input, GenericPackedTensorAccessor output, - const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> mean_, - const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> var_or_invstd, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> mean_, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> var_or_invstd, const GenericPackedTensorAccessor weight, const GenericPackedTensorAccessor bias, stat_accscalar_t epsilon) { @@ -582,7 +582,7 @@ __global__ void batch_norm_backward_elemt_kernel( template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> static GenericPackedTensorAccessor get_packed_accessor( const Tensor& t, c10::string_view var_name) { - constexpr auto expect_type = c10::CppTypeToScalarType::type>::value; + constexpr auto expect_type = c10::CppTypeToScalarType>::value; const auto actual_type = t.scalar_type(); 
TORCH_CHECK(actual_type == expect_type, "Expected ", var_name, " to have type ", expect_type, " but got ", actual_type); diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 0a7081ef0bd15..35a725687574e 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -187,7 +187,7 @@ struct ConvolutionParams }; // ConvolutionParams must be a POD because we read out its memory // contenst as char* when hashing -static_assert(std::is_standard_layout::value, "ConvolutionParams not POD"); +static_assert(std::is_standard_layout_v, "ConvolutionParams not POD"); void setConvolutionParams( ConvolutionParams* params, miopenHandle_t handle, diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index e19243f70cdb4..8ac986b1af647 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -803,7 +803,7 @@ std::tuple unpack_hidden(const std::tuple& hidde template hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { - static_assert(std::is_same::value, "pack_hidden not implemented for this type"); + static_assert(std::is_same_v, "pack_hidden not implemented for this type"); TORCH_CHECK(false, "NOT IMPLEMENTED"); } diff --git a/aten/src/ATen/native/quantized/cpu/XnnpackUtils.cpp b/aten/src/ATen/native/quantized/cpu/XnnpackUtils.cpp index 23f8049ed6a6a..6019817222b40 100644 --- a/aten/src/ATen/native/quantized/cpu/XnnpackUtils.cpp +++ b/aten/src/ATen/native/quantized/cpu/XnnpackUtils.cpp @@ -33,7 +33,7 @@ std::vector get_mem_format_aware_shape(const at::Tensor& in) { template void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out) { using T = typename PT::underlying; - static constexpr auto offset = std::is_same::value ? 128 : 0; + static constexpr auto offset = std::is_same_v ? 128 : 0; TORCH_CHECK( in.scalar_type() == c10::kQInt8, "q8_copy_int8_weight_and_add_offset: Expected input weight data type ", diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 15cb9ab5cb045..135bc0f37e9b5 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -654,7 +654,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl_xnnp( if (!per_channel()) { w_zp = static_cast( weight_contig.q_zero_point() + - (std::is_same::value ? 128 : 0)); + (std::is_same_v ? 128 : 0)); weight_tensor = at::native::empty_affine_quantized( weight_contig.sizes(), diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 9f2cf186e03b3..39925cd42d9f0 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -491,7 +491,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( // prepare weights underlying_t w_zp = static_cast( orig_weight.q_zero_point() + - (std::is_same::value ? 128 : 0)); + (std::is_same_v ? 
128 : 0)); at::Tensor xnnp_weight = at::_empty_affine_quantized( orig_weight.sizes(), diff --git a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu index fe6a697481dba..f8923dd1a61c1 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCsrTensorMath.cu @@ -754,7 +754,7 @@ void _apply_sparse_csr_linear_solve( scalar_t* values_ptr = values.data_ptr(); scalar_t* b_ptr = b.data_ptr(); scalar_t* x_ptr = x.data_ptr(); - auto CUDA_R_TYP = std::is_same::value ? CUDA_R_64F : CUDA_R_32F; + auto CUDA_R_TYP = std::is_same_v ? CUDA_R_64F : CUDA_R_32F; TORCH_CUDSS_CHECK(cudssMatrixCreateDn(&b_mt, b.size(0), 1, b.size(0), b_ptr, CUDA_R_TYP, CUDSS_LAYOUT_COL_MAJOR)); TORCH_CUDSS_CHECK(cudssMatrixCreateDn(&x_mt, x.size(0), 1, x.size(0), x_ptr, CUDA_R_TYP, CUDSS_LAYOUT_COL_MAJOR)); TORCH_CUDSS_CHECK(cudssMatrixCreateCsr(&A_mt, A.size(0), A.size(1), A._nnz(), rowOffsets, rowOffsets + crow.size(0), colIndices, values_ptr, CUDA_R_32I, CUDA_R_TYP, CUDSS_MTYPE_GENERAL, CUDSS_MVIEW_FULL, CUDSS_BASE_ZERO)); diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu index 8755f84cea410..1fa25dad02df0 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu @@ -207,10 +207,10 @@ struct CusparseMatrixMultiplyOp { CusparseMatrixMultiplyOp() { static_assert( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || std::is_same, scalar_t>::value || std::is_same, scalar_t>::value, "cusparseSpGEMM only supports data type of half, bfloat16, float, double and complex float, double."); @@ -669,10 +669,10 @@ void sparse_sparse_matmul_cuda_kernel( const Tensor& mat2) { static_assert( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || std::is_same, scalar_t>::value || std::is_same, scalar_t>::value, "sparse_sparse_matmul_cuda_kernel only supports data type of half, bfloat16, float, double and complex float, double."); diff --git a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu index 9f8fc2ca5a160..72d215bb68dab 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu @@ -211,8 +211,8 @@ void spgemm_cutlass( AlphaArguments alpha_arguments{ [&]() -> AlphaArguments { - if constexpr (std::is_same::value || - std::is_same::value) { + if constexpr (std::is_same_v || + std::is_same_v) { return {ElementComputeEpilogue{alpha.to()}}; } else { return {alpha.to()}; @@ -221,8 +221,8 @@ void spgemm_cutlass( }; BetaArguments beta_arguments{ [&]() -> BetaArguments { - if constexpr (std::is_same::value || - std::is_same::value) { + if constexpr (std::is_same_v || + std::is_same_v) { return {ElementComputeEpilogue{beta.to()}}; } else { return {beta.to()}; diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/dropout.h b/aten/src/ATen/native/transformers/cuda/flash_attn/dropout.h index 8dc4b0b22bcc9..a40815575ff94 100644 --- a/aten/src/ATen/native/transformers/cuda/flash_attn/dropout.h +++ b/aten/src/ATen/native/transformers/cuda/flash_attn/dropout.h @@ -55,7 +55,7 @@ struct Dropout { // We're 
exploiting the fact that floating point comparison is equivalent to integer // comparison, since we're comparing unsigned integers whose top 8-bits are zero. if (!encode_dropout_in_sign_bit - && (std::is_same::value || std::is_same::value)) { + && (std::is_same_v || std::is_same_v)) { uint16_t rnd_16[16]; #pragma unroll for (int i = 0; i < 16; i++) { rnd_16[i] = uint16_t(rnd_8[i]); } diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h index 9bf561d26dd76..3de24290775a8 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h @@ -884,31 +884,31 @@ template void dispatch_cutlassB_f32_sm80(T cb, int cc) { template void dispatch_cutlassB(T cb, int cc = 0) { - if (std::is_same::value && 70 <= cc && cc < 75) { + if (std::is_same_v && 70 <= cc && cc < 75) { dispatch_cutlassB_f16_sm70(cb, cc); } - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassB_bf16_sm80(cb, cc); } - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassB_f16_sm80(cb, cc); } - if (std::is_same::value && 50 <= cc && cc < 70) { + if (std::is_same_v && 50 <= cc && cc < 70) { dispatch_cutlassB_f16_sm50(cb, cc); } - if (std::is_same::value && 50 <= cc && cc < 70) { + if (std::is_same_v && 50 <= cc && cc < 70) { dispatch_cutlassB_f32_sm50(cb, cc); } - if (std::is_same::value && 70 <= cc && cc < 75) { + if (std::is_same_v && 70 <= cc && cc < 75) { dispatch_cutlassB_f32_sm70(cb, cc); } - if (std::is_same::value && 75 <= cc && cc < 80) { + if (std::is_same_v && 75 <= cc && cc < 80) { dispatch_cutlassB_f16_sm75(cb, cc); } - if (std::is_same::value && 75 <= cc && cc < 80) { + if (std::is_same_v && 75 <= cc && cc < 80) { dispatch_cutlassB_f32_sm75(cb, cc); } - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassB_f32_sm80(cb, cc); } } diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h index c8e38916501ea..fb3b48b5f838b 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h @@ -283,31 +283,31 @@ template void dispatch_cutlassF_f32_sm80(T cb, int cc) { template void dispatch_cutlassF(T cb, int cc = 0) { - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassF_bf16_sm80(cb, cc); } - if (std::is_same::value && 50 <= cc && cc < 70) { + if (std::is_same_v && 50 <= cc && cc < 70) { dispatch_cutlassF_f16_sm50(cb, cc); } - if (std::is_same::value && 70 <= cc && cc < 75) { + if (std::is_same_v && 70 <= cc && cc < 75) { dispatch_cutlassF_f16_sm70(cb, cc); } - if (std::is_same::value && 75 <= cc && cc < 80) { + if (std::is_same_v && 75 <= cc && cc < 80) { dispatch_cutlassF_f16_sm75(cb, cc); } - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassF_f16_sm80(cb, cc); } - if (std::is_same::value && 50 <= cc && cc < 70) { + if (std::is_same_v && 50 <= cc && cc < 70) { dispatch_cutlassF_f32_sm50(cb, cc); } - if (std::is_same::value && 70 <= cc && cc < 75) { + if (std::is_same_v && 70 <= cc && cc 
< 75) { dispatch_cutlassF_f32_sm70(cb, cc); } - if (std::is_same::value && 75 <= cc && cc < 80) { + if (std::is_same_v && 75 <= cc && cc < 80) { dispatch_cutlassF_f32_sm75(cb, cc); } - if (std::is_same::value && 80 <= cc && cc < 100) { + if (std::is_same_v && 80 <= cc && cc < 100) { dispatch_cutlassF_f32_sm80(cb, cc); } } diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py index 0fb723d9d8fac..d056eb223148b 100644 --- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py +++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernels/generate_kernels.py @@ -352,7 +352,7 @@ def write_decl_impl( declarations += f" {_call}" declarations += "}\n\n" dispatch_all += f""" - if (std::is_same::value && {cat_sm} <= cc && cc < {cat_sm_max}) {{ + if (std::is_same_v && {cat_sm} <= cc && cc < {cat_sm_max}) {{ {dispatch_category_fn}(cb, cc); }}""" diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index bdc7a95a31b38..3172c9c461079 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -197,7 +197,7 @@ inline constexpr To safe_downcast(const From& v) { template inline constexpr bool is_signed_to_unsigned() { - return std::is_signed::value && std::is_unsigned::value; + return std::is_signed_v && std::is_unsigned_v; } } // namespace detail diff --git a/aten/src/ATen/test/vec_test_all_types.h b/aten/src/ATen/test/vec_test_all_types.h index db2e2616a306c..d51fd9ef6a867 100644 --- a/aten/src/ATen/test/vec_test_all_types.h +++ b/aten/src/ATen/test/vec_test_all_types.h @@ -780,13 +780,13 @@ class TestCaseBuilder { }; template -typename std::enable_if_t::value&& std::is_unsigned::value, T> +typename std::enable_if_t::value&& std::is_unsigned_v, T> correctEpsilon(const T& eps) { return eps; } template -typename std::enable_if_t::value && !std::is_unsigned::value, T> +typename std::enable_if_t::value && !std::is_unsigned_v, T> correctEpsilon(const T& eps) { return std::abs(eps); diff --git a/c10/core/DeviceArray.h b/c10/core/DeviceArray.h index e187f5a669db5..d9d4c72d48cd6 100644 --- a/c10/core/DeviceArray.h +++ b/c10/core/DeviceArray.h @@ -11,7 +11,7 @@ class DeviceArray { public: DeviceArray(c10::Allocator& allocator, size_t size) : data_ptr_(allocator.allocate(size * sizeof(T))) { - static_assert(std::is_trivial::value, "T must be a trivial type"); + static_assert(std::is_trivial_v, "T must be a trivial type"); TORCH_INTERNAL_ASSERT( 0 == (reinterpret_cast(data_ptr_.get()) % alignof(T)), "c10::DeviceArray: Allocated memory is not aligned for this data type"); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 888881ac2d74d..1c7add8cbfcc8 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -2322,7 +2322,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Check it here statically - otherwise TypeMeta would throw the runtime // error in attempt to invoke TypeMeta::ctor() static_assert( - std::is_default_constructible::value, + std::is_default_constructible_v, "Tensor can't hold non-default-constructable types"); return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); } diff --git a/c10/test/util/string_view_test.cpp b/c10/test/util/string_view_test.cpp index 59b956481351d..8577bc3595595 100644 --- a/c10/test/util/string_view_test.cpp +++ b/c10/test/util/string_view_test.cpp @@ -37,17 +37,13 @@ using 
testutils::expectThrows; using testutils::string_equal; namespace test_typedefs { -static_assert(std::is_same::value, ""); -static_assert(std::is_same::value, ""); -static_assert(std::is_same::value, ""); -static_assert(std::is_same::value, ""); -static_assert( - std::is_same::value, - ""); -static_assert(std::is_same::value, ""); -static_assert( - std::is_same::value, - ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); +static_assert(std::is_same_v, ""); } // namespace test_typedefs namespace test_default_constructor { diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index c977d7e92b2a6..bd1405c1fc652 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -114,7 +114,7 @@ class ArrayRef final { /* implicit */ ArrayRef(const std::vector& Vec) : Data(Vec.data()), Length(Vec.size()) { static_assert( - !std::is_same::value, + !std::is_same_v, "ArrayRef cannot be constructed from a std::vector bitfield."); } diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index d45b8c8616f5f..0b5282c9b9e64 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -842,7 +842,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase { // If we just moved the element we're inserting, be sure to update // the reference (never happens if TakesParamByValue). static_assert( - !TakesParamByValue || std::is_same::value, + !TakesParamByValue || std::is_same_v, "ArgType must be 'T' when taking by value!"); if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end())) ++EltPtr; diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 8f50e91d8295c..e1d551930e162 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -232,7 +232,7 @@ class intrusive_ptr final { // the target class T to be fully defined when intrusive_ptr is instantiated // this is a problem for classes that contain pointers to themselves // static_assert( -// std::is_base_of::value, +// std::is_base_of_v, // "intrusive_ptr can only be used for classes that inherit from // intrusive_ptr_target."); #ifndef _WIN32 @@ -354,7 +354,7 @@ class intrusive_ptr final { : target_( detail::assign_ptr_(rhs.target_)) { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. intrusive_ptr move constructor got pointer of wrong type."); rhs.target_ = FromNullType::singleton(); } @@ -368,7 +368,7 @@ class intrusive_ptr final { : target_( detail::assign_ptr_(rhs.target_)) { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. intrusive_ptr copy constructor got pointer of wrong type."); retain_(); } @@ -385,7 +385,7 @@ class intrusive_ptr final { template intrusive_ptr& operator=(intrusive_ptr&& rhs) & noexcept { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. intrusive_ptr move assignment got pointer of wrong type."); intrusive_ptr tmp = std::move(rhs); swap(tmp); @@ -404,7 +404,7 @@ class intrusive_ptr final { intrusive_ptr& operator=( const intrusive_ptr& rhs) & noexcept { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. 
intrusive_ptr copy assignment got pointer of wrong type."); intrusive_ptr tmp = rhs; swap(tmp); @@ -743,7 +743,7 @@ class weak_intrusive_ptr final { : target_( detail::assign_ptr_(rhs.target_)) { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. weak_intrusive_ptr move constructor got pointer of wrong type."); rhs.target_ = FromNullType::singleton(); } @@ -758,7 +758,7 @@ class weak_intrusive_ptr final { : target_( detail::assign_ptr_(rhs.target_)) { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. weak_intrusive_ptr copy constructor got pointer of wrong type."); retain_(); } @@ -776,7 +776,7 @@ class weak_intrusive_ptr final { weak_intrusive_ptr& operator=( weak_intrusive_ptr&& rhs) & noexcept { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. weak_intrusive_ptr move assignment got pointer of wrong type."); weak_intrusive_ptr tmp = std::move(rhs); swap(tmp); @@ -802,7 +802,7 @@ class weak_intrusive_ptr final { weak_intrusive_ptr& operator=( const weak_intrusive_ptr& rhs) & noexcept { static_assert( - std::is_convertible::value, + std::is_convertible_v, "Type mismatch. weak_intrusive_ptr copy assignment got pointer of wrong type."); weak_intrusive_ptr tmp = rhs; swap(tmp); diff --git a/c10/util/strong_type.h b/c10/util/strong_type.h index 8b2a88ea1d90c..1399c27c7d186 100644 --- a/c10/util/strong_type.h +++ b/c10/util/strong_type.h @@ -46,7 +46,7 @@ namespace strong namespace impl { template - using WhenConstructible = std::enable_if_t::value>; + using WhenConstructible = std::enable_if_t>; } template @@ -101,18 +101,18 @@ class type : public modifier>... { } template ::value && (sizeof...(U) > 0)>> + typename = std::enable_if_t && (sizeof...(U) > 0)>> constexpr explicit type( U&& ... u) - noexcept(std::is_nothrow_constructible::value) + noexcept(std::is_nothrow_constructible_v) : val(std::forward(u)...) 
{} friend constexpr void swap(type& a, type& b) noexcept( - std::is_nothrow_move_constructible::value && - std::is_nothrow_move_assignable::value + std::is_nothrow_move_constructible_v && + std::is_nothrow_move_assignable_v ) { using std::swap; @@ -820,7 +820,7 @@ class affine_point::modifier<::strong::type> using base_diff_type = decltype(std::declval() - std::declval()); public: using difference = std::conditional_t{}, strong::type, D>; - static_assert(std::is_constructible::value, ""); + static_assert(std::is_constructible_v, ""); [[nodiscard]] friend constexpr diff --git a/caffe2/perfkernels/embedding_lookup_idx.cc b/caffe2/perfkernels/embedding_lookup_idx.cc index 7c62d9e883fd6..76be1201d589e 100644 --- a/caffe2/perfkernels/embedding_lookup_idx.cc +++ b/caffe2/perfkernels/embedding_lookup_idx.cc @@ -129,7 +129,7 @@ static bool EmbeddingLookupGenericSlowIdx( const float* scale_bias, \ bool normalize_by_lengths, \ OutType* out) { \ - if constexpr (std::is_same::value) { \ + if constexpr (std::is_same_v) { \ CAFFE_ENFORCE(scale_bias != nullptr, "scale_bias must not be nullptr"); \ } else { \ CAFFE_ENFORCE(scale_bias == nullptr, "scale_bias must be nullptr"); \ diff --git a/test/cpp/api/static.cpp b/test/cpp/api/static.cpp index 4ff71682da1a2..d174c02ec927d 100644 --- a/test/cpp/api/static.cpp +++ b/test/cpp/api/static.cpp @@ -68,8 +68,7 @@ template void assert_has_expected_type() { using ReturnType = typename torch::detail::return_type_of_forward::type; - constexpr bool is_expected_type = - std::is_same::value; + constexpr bool is_expected_type = std::is_same_v; ASSERT_TRUE(is_expected_type) << Module().name(); } diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index c914c6a7338bd..5886ac3628d85 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -2395,7 +2395,7 @@ class CppPythonBindingsCodeCache(CppCodeCache): static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj); template static inline T parse_arg(PyObject* args, size_t n) { - static_assert(std::is_pointer::value, "arg type must be pointer or long"); + static_assert(std::is_pointer_v, "arg type must be pointer or long"); return static_cast(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n))); } template <> inline int64_t parse_arg(PyObject* args, size_t n) { diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h index 0ae57c7d4c649..0b7ae5ea27a8a 100644 --- a/torch/_inductor/codegen/cpp_prefix.h +++ b/torch/_inductor/codegen/cpp_prefix.h @@ -537,7 +537,7 @@ Welford welford_vec_reduce_all(Welford inline typename std::common_type::type mod(T a, U b) { return a % b; } +template inline typename std::common_type_t mod(T a, U b) { return a % b; } template <> inline float mod(float a, float b) { return std::fmod(a, b); } template <> inline double mod(double a, double b) { return std::fmod(a, b); } diff --git a/torch/csrc/api/include/torch/nn/modules/container/any.h b/torch/csrc/api/include/torch/nn/modules/container/any.h index 89b14f0d4e893..28f297388757b 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/any.h +++ b/torch/csrc/api/include/torch/nn/modules/container/any.h @@ -205,7 +205,7 @@ template AnyModule::AnyModule(std::shared_ptr module) : content_(make_holder( std::move(module), - &std::remove_reference::type::forward)) { + &std::remove_reference_t::forward)) { // `AnyModule` can only store an `nn::Module` subclass object that provides // a `forward()` method that has a non-templatized return type. 
// (e.g. `AnyModule` cannot store `nn::Sequential`, because `nn::Sequential`'s diff --git a/torch/csrc/api/include/torch/nn/modules/container/named_any.h b/torch/csrc/api/include/torch/nn/modules/container/named_any.h index 542471f61f2df..9b7c01b08e9cf 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/named_any.h +++ b/torch/csrc/api/include/torch/nn/modules/container/named_any.h @@ -39,7 +39,7 @@ class NamedAnyModule { /// Creates a `NamedAnyModule` from a `Module`, moving or copying it /// into a `shared_ptr` internally. - // NOTE: We need to use `std::remove_reference::type` to get rid of + // NOTE: We need to use `std::remove_reference_t` to get rid of // any reference components for make_unique. template > NamedAnyModule(std::string name, M&& module) diff --git a/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h b/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h index afb41ee0bdd09..a01808b013828 100644 --- a/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h +++ b/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h @@ -61,7 +61,7 @@ class MiniArrayRef final { /* implicit */ MiniArrayRef(const std::vector& Vec) : Data(Vec.data()), Length(Vec.size()) { static_assert( - !std::is_same::value, + !std::is_same_v, "MiniArrayRef cannot be constructed from a std::vector bitfield."); } diff --git a/torch/csrc/jit/backends/backend.h b/torch/csrc/jit/backends/backend.h index db0205d395ddc..a6b567c85480f 100644 --- a/torch/csrc/jit/backends/backend.h +++ b/torch/csrc/jit/backends/backend.h @@ -89,7 +89,7 @@ std::function getExecuteFunc() { template class backend { static_assert( - std::is_base_of::value, + std::is_base_of_v, "torch::jit::backend requires T to inherit from PyTorchBackendInterface"); std::string backend_name_; diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 324fc37e080c6..493a63b944469 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -64,7 +64,7 @@ struct ArgumentInfo { }; static_assert( - std::is_standard_layout::value, + std::is_standard_layout_v, "ArgumentInfo is to be a POD struct"); static_assert( sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type), @@ -106,7 +106,7 @@ struct ArgumentSpec { at::Device device = t->device(); arg.dev_type_ = // NOLINTNEXTLINE(bugprone-signed-char-misuse) - static_cast::type>(device.type()); + static_cast>(device.type()); // NOLINTNEXTLINE(bugprone-signed-char-misuse) arg.device_ = device.index(); arg.type_ = static_cast(t->scalar_type()); @@ -266,8 +266,8 @@ struct CompleteArgumentSpec { pod.type = static_cast(t.scalar_type()); at::Device device = t.device(); // NOLINTNEXTLINE(bugprone-signed-char-misuse) - pod.dev_type = static_cast::type>( - device.type()); + pod.dev_type = + static_cast>(device.type()); // NOLINTNEXTLINE(bugprone-signed-char-misuse) pod.device = device.index(); pod.requires_grad = with_grad && t.requires_grad(); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 5fe52a0ff9e0e..90a0513719c92 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -1078,7 +1078,7 @@ template std::enable_if_t, llvm::Value*> getFromType( llvm::Type* type, T value) { - return llvm::ConstantInt::get(type, value, std::is_signed::value); + return llvm::ConstantInt::get(type, value, std::is_signed_v); } template diff --git a/torch/csrc/xpu/Module.cpp b/torch/csrc/xpu/Module.cpp index bd417a8de5a17..4fcf73e820636 100644 
--- a/torch/csrc/xpu/Module.cpp +++ b/torch/csrc/xpu/Module.cpp @@ -297,7 +297,7 @@ static void registerXpuDeviceProperties(PyObject* module) { break; default: stream << "unknown device type:" - << static_cast::type>( + << static_cast>( prop.device_type); break; } diff --git a/torchgen/gen.py b/torchgen/gen.py index ab918577c6160..9d4c94d98b59b 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -691,7 +691,7 @@ def __call__(self, f: NativeFunction) -> str | None: if has_symint: result += f""" namespace symint {{ - template ::value>> + template >> {sig.decl(suppress_symint_suffix=True)} {{ return at::_ops::{f.func.name.unambiguous_name()}::call({exprs_str}); }} From fee17d530d2e2982ad76054dddf661fb2c9860a3 Mon Sep 17 00:00:00 2001 From: Mu-Chu Lee Date: Sun, 27 Oct 2024 00:57:11 +0000 Subject: [PATCH 128/161] [AOTInductor] Add relu_nan_to_num option for pre-grad passes (#138545) Summary: Add a relu_nan_to_num pass to the pre-grad passes. Test Plan: Included in commit Differential Revision: D64724780 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138545 Approved by: https://github.com/chenyang78 --- torch/_inductor/config.py | 3 +++ torch/_inductor/fx_passes/pre_grad.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index f39051db75ec7..55563e5dd3521 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1069,6 +1069,9 @@ class aot_inductor: os.environ.get("AOTINDUCTOR_RAISE_ERROR_ON_IGNORED_OPTIMIZATION", "1") == "1" ) + # Dictionary of presets that can be passed in + presets: Dict[str, Any] = {} + class cuda: # CUDA arch to use for CUDA template kernel compilation. diff --git a/torch/_inductor/fx_passes/pre_grad.py b/torch/_inductor/fx_passes/pre_grad.py index 16a6a74aea146..80e640d54fc26 100644 --- a/torch/_inductor/fx_passes/pre_grad.py +++ b/torch/_inductor/fx_passes/pre_grad.py @@ -104,6 +104,10 @@ def merge_concats_pass(graph): return None +def relu_nan_to_num(graph): + return None + + @init_once_fakemode def lazy_init(): from . import efficient_conv_bn_eval, split_cat # noqa: F401 # noqa: F401 @@ -162,6 +166,12 @@ def shape_prop(mod) -> None: example_inputs, "[Pre grad(predispatch IR)]Apply remove_noop pass", ) + pass_execution_and_save( + relu_nan_to_num, + gm, + example_inputs, + "[Pre grad(predispatch IR)]Apply relu_nan_to_num pass", + ) pass_execution_and_save( fuse_chunk_reshape_concat_pass, gm, From 28013aa5279a26c1d0c47fd30df7045f31df2123 Mon Sep 17 00:00:00 2001 From: Mu-Chu Lee Date: Sun, 27 Oct 2024 01:12:27 +0000 Subject: [PATCH 129/161] [AOTInductor] Disable comprehensive_padding when use_runtime_constant_folding=True (#138872) Summary: Disable comprehensive_padding when use_runtime_constant_folding=True. We need to disable comprehensive padding because it modifies strides, so the stride information between the constant graph and the main graph would otherwise differ.
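For illustration, here is a minimal hedged sketch (not part of the patch) of how the flag that takes the new branch is typically turned on; the `compile_fx.py` change itself is in the diff below, and setting the flag directly on the config module is just one way to pass it (the Test Plan passes it via `--aot-inductor-config` instead).

```python
# Hedged sketch, not from the patch: enable runtime constant folding so that
# maybe_disable_comprehensive_padding() takes the new branch added below.
import torch._inductor.config as inductor_config

inductor_config.aot_inductor.use_runtime_constant_folding = True
# With the flag set, Inductor patches comprehensive_padding=False for the
# compilation, keeping constant-graph and main-graph strides consistent.
```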
Test Plan: ``` buck2 run mode/opt -c fbcode.platform010_cuda_version=12 -c fbcode.nvcc_arch=a100 caffe2/torch/fb/model_transform/experimental/benchmark:mts_gpu_benchmark -- --model-path=manifold://ads_storage_fblearner/tree/user/facebook/fblearner/predictor/643940255/17/gpu_lowering/input.predictor.disagg.gpu.merge --lower-backend="AOT_INDUCTOR_EP" --aot-inductor-config="{'max_autotune': True, 'aot_inductor.use_runtime_constant_folding': True}" ``` Reviewed By: 22quinn, henryoier Differential Revision: D64927546 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138872 Approved by: https://github.com/chenyang78 --- torch/_inductor/compile_fx.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index bccbbb38eb918..0b9bf0c8b6f27 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -447,6 +447,11 @@ def maybe_disable_comprehensive_padding( if config.disable_padding_cpu and config.comprehensive_padding and not has_gpu: perf_hint_log.info("Skip comprehensive padding on CPU") return config.patch(comprehensive_padding=False) + elif config.aot_inductor.use_runtime_constant_folding: + perf_hint_log.info( + "Skip comprehensive padding for use_runtime_constant_folding" + ) + return config.patch(comprehensive_padding=False) else: return contextlib.nullcontext() From 14a45d77931670072d3773dfbc8d12ff73c252af Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 26 Oct 2024 20:05:38 -0700 Subject: [PATCH 130/161] Refactor core algorithm for automatic dynamic shapes (#138717) While working on automatic dynamic PGO (https://github.com/pytorch/pytorch/pull/138052) one abstract property I was looking for out of profile information is that it formed a semilattice: I could join together two profiles and get a merged profile that is consistent with the profiles that I saw in both cases. While working on this data structure that supported joins, I realized that the base automatic dynamic algorithm could be implemented in this way, therefore this refactor. The basic recipe is that we now support a join operation on FrameStateSizeEntry. Intuitively, if you join two sizes that are equal, you get back that size (join(2, 2) == 2), but if you join two different sizes you get a special singleton auto_dynamic indicating that the size of the tensor is dynamic (join(2, 3) == auto_dynamic). So now, the automatic dynamic algorithm is: (1) compute the FrameStateSizeEntry that corresponds to the concrete values we've seen, and (2) join it into the ambient FrameStateSizeEntry. As a bonus, compiler collectives can buy into the same abstraction (we're simply distributing FrameStateSizeEntry from each node to every other node). For convenience, I also added the necessary `auto_unset` extra state which is the identity element (which makes our semilattice bounded from both top and bottom). Here, join(2, auto_unset) == 2. While doing this, there was a complication: the infer stride algorithm wasn't technically a semilattice. Here, I did what I suggested in the original code review https://github.com/pytorch/pytorch/pull/130232 which is stop using a heuristic, and instead replicate the stride inference algorithm in automatic dynamic. This means that when I join strides together, I don't join their concrete values, instead, if a stride can be inferred as the contiguous stride for a particular inner dimension, then you represent it as InferStride(dim). There's an example in code which I recommend looking at. 
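For a quick feel for the join semantics before reading that code, here is a minimal standalone sketch. It is illustrative only: the sentinels and helper names (`AUTO_UNSET`, `AUTO_DYNAMIC`, `join_atom`, `join_tuple`) are invented for the example, and the real implementation is the `FrameStateSizeEntry` / `InferStride` code added in `torch/_dynamo/pgo.py` in the diff below.

```python
# Illustrative sketch of the join semantics described above, not the patch itself.
from dataclasses import dataclass

AUTO_UNSET = "auto_unset"      # identity element: no observation yet
AUTO_DYNAMIC = "auto_dynamic"  # top element: observed to vary, treat as dynamic


@dataclass(frozen=True)
class InferStride:
    """Stands for stride[dim] * size[dim] of the entry being joined."""

    dim: int


def join_atom(x, y):
    # Join of a single scalar/size/stride observation.
    if x == AUTO_UNSET:
        return y
    if y == AUTO_UNSET:
        return x
    if x == AUTO_DYNAMIC or y == AUTO_DYNAMIC or x != y:
        return AUTO_DYNAMIC
    return x


def join_tuple(xs, ys):
    # Join of a whole size/stride tuple; a dimensionality mismatch goes dynamic.
    if len(xs) != len(ys):
        return AUTO_DYNAMIC
    return tuple(join_atom(x, y) for x, y in zip(xs, ys))


assert join_atom(2, 2) == 2             # agreeing profiles stay static
assert join_atom(2, 3) == AUTO_DYNAMIC  # disagreement means dynamic
assert join_atom(2, AUTO_UNSET) == 2    # auto_unset is the identity

# Size join from the example above: [2, 3] and [2, 4] become [2, dynamic].
assert join_tuple((2, 3), (2, 4)) == (2, AUTO_DYNAMIC)

# Stride join: [2, 3][3, 1] and [2, 4][4, 1] both rewrite the outer stride as
# InferStride(1), so the join keeps the size/stride relationship intact instead
# of allocating two unrelated dynamic symbols.
assert join_tuple((InferStride(1), 1), (InferStride(1), 1)) == (InferStride(1), 1)
```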
Some other extra things that are happening in this PR: * I tried to deduplicate the size/stride automatic dynamic logic as much as possible. So hopefully less code to review here. * I had to reimplement all the logging. For the most part I tried to track the logging as closely to the original as possible, but I think we could be emitting less Chrome events here * The `marked_dynamic` handling is still preserved as is, but I kind of don't like it and we should figure out how to put it somewhere else Signed-off-by: Edward Z. Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/138717 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #138693 --- torch/_dynamo/pgo.py | 298 +++++++++++++++++++++++++++++ torch/_dynamo/symbolic_convert.py | 7 +- torch/_dynamo/variables/builder.py | 275 +++++++------------------- 3 files changed, 371 insertions(+), 209 deletions(-) create mode 100644 torch/_dynamo/pgo.py diff --git a/torch/_dynamo/pgo.py b/torch/_dynamo/pgo.py new file mode 100644 index 0000000000000..a4b5664848090 --- /dev/null +++ b/torch/_dynamo/pgo.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +import copy +import dataclasses +import enum +import logging +import time +from typing import Optional, Tuple, TYPE_CHECKING, TypeVar, Union +from typing_extensions import Self + +from torch._dynamo.utils import get_chromium_event_logger + + +if TYPE_CHECKING: + from torch._dynamo.symbolic_convert import InstructionTranslator + + +log = logging.getLogger(__name__) + + +@dataclasses.dataclass(frozen=True) +class InferStride: + """ + Denotes the quantity stride[dim] * size[dim], which is what the stride would + be for the next physical dimension that results in a contiguous layout. + + For example, given size = [2, 3], stride = [3, 1], we can replace this with + stride = [InferStride(1), 1], because InferStride(1) = stride[1] * size[1] = 1 * 3 = 3 + + Indirecting the representation in this way is important for the join operation + on strides as if we join [2, 3][3, 1] and [2, 4][4, 1], + we don't want [2, None][None, 1] which would get eventually symbolized into + [2, s0][s1, 1] (notice that the relationship between s0 and s1 is broken). + If we instead rewrite the expressions as InferStride so we have [2, 3][InferStride(1), 1] + and [2, 4][InferStride(1), 1] we now join to [2, None][InferStride(1), 1] will + result in [2, s0][s0, 1], as desired. + """ + + dim: int + + +_T = TypeVar("_T") + + +class AutoUnset(enum.Enum): + """ + The identity element of our semilattice, a generic "don't know" element that + is always subsumed when we get more information. + """ + + token = 0 + + +auto_unset = AutoUnset.token + + +class AutoDynamic(enum.Enum): + """ + The top element of our (bounded) semilattice, whenever you merge this with + any other element you always get it again + """ + + token = 0 + + +auto_dynamic = AutoDynamic.token + + +@dataclasses.dataclass +class FrameStateSizeEntry: + scalar: Union[int, AutoDynamic, AutoUnset] = dataclasses.field(default=auto_unset) + # NB: We don't have cases where we have a known dimensionality but + # we know NOTHING about the individual sizes + size: Union[ + AutoDynamic, AutoUnset, Tuple[Union[int, AutoDynamic], ...] + ] = dataclasses.field(default=auto_unset) + stride: Union[ + AutoDynamic, AutoUnset, Tuple[Union[int, AutoDynamic, InferStride], ...] 
+ ] = dataclasses.field(default=auto_unset) + + def is_size_dynamic(self, dim: int) -> bool: + if self.size is auto_dynamic: + return True + if self.size is auto_unset: + return False + return self.size[dim] is auto_dynamic + + def is_stride_dynamic(self, dim: int) -> bool: + # At the moment, dynamic strides is a bit buggy. Good test case + # here is `PYTORCH_TEST_WITH_DYNAMO=1 python test/test_autograd.py + # TestAutograd.test_gradcheck_jacobian_mismatch` + # + # This if statement preserves historical behavior, which is that we + # ONLY make strides dynamic if the size is exactly static everywhere. + # We could potentially relax this but in general we should be very + # careful about when to infer dynamic strides. + # + # Actually, the existing algorithm is already somewhat problematic. + # Suppose a tensor that is sometimes: + # f32[2, 3, 5][15, 5, 1] and other times + # f32[2, 3, 5][5, 10, 1] (specifically, dim 0 and 1 are physically transposed). + # If we infer strides should be (DYNAMIC, DYNAMIC, 1). But this is + # silly: we really should have just guarded on dim order. + if not ( + isinstance(self.size, tuple) and all(type(s) is int for s in self.size) + ): + return False + if self.stride is auto_dynamic: + return True + if self.stride is auto_unset: + return False + return self.stride[dim] is auto_dynamic + + @staticmethod + def make_scalar(x: int) -> FrameStateSizeEntry: + return FrameStateSizeEntry(scalar=x, size=auto_dynamic, stride=auto_dynamic) + + # NB: steals the inputs + @staticmethod + def make_tensor( + size: Tuple[int, ...], stride: Tuple[int, ...] + ) -> FrameStateSizeEntry: + return FrameStateSizeEntry(scalar=auto_dynamic, size=size, stride=stride) + + @staticmethod + def _merge_atom(x: _T, y: _T) -> Union[AutoDynamic, _T]: + if x is auto_unset: + return y + if y is auto_unset: + return x + if x is auto_dynamic or y is auto_dynamic or x != y: + return auto_dynamic + return x + + @classmethod + def _merge_atom_tup( + cls, + xs: Union[AutoDynamic, AutoUnset, Tuple[_T, ...]], + ys: Union[AutoDynamic, AutoUnset, Tuple[_T, ...]], + ) -> Union[AutoDynamic, AutoUnset, Tuple[Union[AutoDynamic, _T], ...]]: + if xs is auto_unset: + return ys + if ys is auto_unset: + return xs + if xs is auto_dynamic or ys is auto_dynamic: + return auto_dynamic + if len(xs) != len(ys): + return auto_dynamic + return tuple(cls._merge_atom(x, y) for x, y in zip(xs, ys)) + + def __ior__(self, other: Self) -> Self: + self.scalar = self._merge_atom(self.scalar, other.scalar) + self.size = self._merge_atom_tup(self.size, other.size) + self.stride = self._merge_atom_tup(self.stride, other.stride) + return self + + +def update_automatic_dynamic( + tx: InstructionTranslator, + name: str, + entry: FrameStateSizeEntry, + *, + is_unspecialized_nn_module: bool = False, +) -> FrameStateSizeEntry: + is_update = name in tx.output.frame_state + mut_entry = tx.output.frame_state.setdefault(name, FrameStateSizeEntry()) + old_entry = copy.copy(mut_entry) + mut_entry |= entry + + # Do some logs (damn, I spend more code logging than I do actually doing + # the updates lol) + if is_update and old_entry.scalar != mut_entry.scalar: + log.debug( + "automatic dynamic int %s val %s != %s", + name, + entry.scalar, + old_entry.scalar, + ) + get_chromium_event_logger().log_instant_event( + "automatic_dynamic", + time.time_ns(), + { + "name": name, + "dim_changed": "scalar", + "reason": "scalar change", + "cached": str(old_entry.scalar), + "new": str(entry.scalar), + }, + ) + if is_unspecialized_nn_module: + log.info( + "%s is 
converted to a symbolic integer. It is an attribute of a " + "user defined nn module class. If you wish to keep it static, you can " + "mark the nn module class as `torch._dynamo.mark_static`.", + name, + ) + + def log_tup( + tup_name: str, short_reason: str, long_reason: str, i: Optional[int] = None + ) -> None: + entry_tup = ( + getattr(entry, tup_name) if i is None else getattr(entry, tup_name)[i] + ) + old_entry_tup = ( + getattr(old_entry, tup_name) + if i is None + else getattr(old_entry, tup_name)[i] + ) + log.debug( + "automatic dynamic %s %s %s %s != %s", + tup_name, + name, + short_reason, + # NB: We used to only report len(...) here for dim mismatch + entry_tup, + old_entry_tup, + ) + get_chromium_event_logger().log_instant_event( + "automatic_dynamic", + time.time_ns(), + { + "name": name, + "dim_changed": "all" if i is None else i, + "reason": long_reason, + "cached": str(old_entry_tup), + "new": str(entry_tup), + }, + ) + + if is_update and old_entry.size != mut_entry.size: + if isinstance(old_entry.size, tuple) and isinstance(entry.size, tuple): + if len(old_entry.size) != len(entry.size): + log_tup("size", "dim", "dimensionality change") + else: + for i in range(len(entry.size)): + if old_entry.size[i] != entry.size[i]: + log_tup("size", f"size({i})", "size change", i) + else: + log_tup("size", "other", "other") + + if is_update and old_entry.stride != mut_entry.stride: + if isinstance(old_entry.stride, tuple) and isinstance(entry.stride, tuple): + if len(old_entry.stride) != len(entry.stride): + log_tup("stride", "dim", "dimensionality change") + else: + for i in range(len(entry.stride)): + if old_entry.stride[i] != entry.stride[i]: + log_tup("stride", f"stride({i})", "stride change", i) + else: + log_tup("stride", "other", "other") + + return mut_entry + + +def process_automatic_dynamic( + tx: InstructionTranslator, + name: str, + entry: FrameStateSizeEntry, + *, + is_unspecialized_nn_module: bool = False, +) -> FrameStateSizeEntry: + if (st := tx.distributed_state) is None: + return update_automatic_dynamic( + tx, + name, + entry, + is_unspecialized_nn_module=is_unspecialized_nn_module, + ) + elif st.all_states is None: + # Preflight, always pretend as if it's static. The point here + # is we want to get through the preflight quickly, and static + # will run faster. The preexisting frame state will get + # applied anyway after we do compiler collectives. + # TODO: I'm not sure if we should just bong the entire pgo + # state here, it kind of depends if we're going to have other + # things that talk in compiler collective. Also, the PGO + # state, if we've already inferred something is automatic + # dynamic, will have lost the actual input sizes, which might + # be useful for debugging purposes (e.g., observing 0/1 + # specialization). Bonging the entire PGO state here would + # let us delete this logic here; the compiler collective + # would just directly update_automatic_dynamic + st.local_state.automatic_dynamic[name] = entry + return entry + else: + # Apply the updates. NB: all_states includes the local state + # too. 
+ res = None + for sub_state in st.all_states: + if name in sub_state.automatic_dynamic: + res = update_automatic_dynamic( + tx, + name, + sub_state.automatic_dynamic[name], + is_unspecialized_nn_module=is_unspecialized_nn_module, + ) + assert res is not None + return res diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 09c2c59e60944..869495bbf832f 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -71,7 +71,7 @@ proxy_args_kwargs, ) from .variables.base import MutableLocal, typestr, VariableTracker -from .variables.builder import wrap_fx_proxy +from .variables.builder import FrameStateSizeEntry, wrap_fx_proxy from .variables.builtin import BuiltinVariable from .variables.constant import ConstantVariable from .variables.ctx_manager import ( @@ -226,8 +226,9 @@ def next( @dataclasses.dataclass class LocalState: - input_sizes: Dict[str, List[int]] = dataclasses.field(default_factory=dict) - input_strides: Dict[str, List[int]] = dataclasses.field(default_factory=dict) + automatic_dynamic: Dict[str, FrameStateSizeEntry] = dataclasses.field( + default_factory=dict + ) # Mutable box that is shared across restarts diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index a47948dc541f0..20247b49e84d5 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -15,7 +15,6 @@ import random import re import sys -import time import types import warnings import weakref @@ -35,7 +34,6 @@ import torch from torch import SymInt -from torch._dynamo.utils import get_chromium_event_logger from torch._guards import GuardSource, TracingContext from torch._higher_order_ops.torchbind import call_torchbind from torch._ops import HigherOrderOperator @@ -45,6 +43,7 @@ from torch.fx.experimental._backward_state import BackwardState from torch.fx.experimental.symbolic_shapes import ( _constrain_range_for_size, + _nested_int_aware_sort, DimDynamic, RelaxedUnspecConstraint, StatefulSymbolicContext, @@ -60,6 +59,13 @@ from ..device_interface import get_registered_device_interfaces from ..exc import InternalTorchDynamoError, unimplemented from ..guards import GuardBuilder, install_guard, make_dupe_guard +from ..pgo import ( + auto_dynamic, + auto_unset, + FrameStateSizeEntry, + InferStride, + process_automatic_dynamic, +) from ..side_effects import SideEffects from ..source import ( AttrProxySource, @@ -333,13 +339,6 @@ def reconstruct(self, codegen): codegen.store(codegen.tx.output.backward_state_var) -@dataclasses.dataclass -class FrameStateSizeEntry: - scalar: Optional[int] - size: Optional[List[int]] - stride: Optional[List[int]] - - # All class-based iterators in itertools # NOTE: use id() because some objects are not hashable, it will raise error during lookup ITERTOOLS_TYPE_IDS: FrozenSet[int] = frozenset( @@ -1777,69 +1776,20 @@ def wrap_symint(self, value): name = self.source.name() - def update_frame_state(value): - if name not in self.tx.output.frame_state: - # Note - this essentially means that if this name gets reused as a tensor, - # it will start fully dynamic. That should always be a safe option, and not awfully inefficient. - # Alternatively, if we want to improve pef here, we can add a third state of unset, but I am not - # sure that is necessary for now. 
- frame_state_entry = FrameStateSizeEntry( - scalar=value, size=None, stride=None - ) - else: - frame_state_entry = self.tx.output.frame_state[name] - if frame_state_entry.scalar != value: - log.debug( - "automatic dynamic int %s val %s != %s", - name, - value, - frame_state_entry.scalar, - ) - get_chromium_event_logger().log_instant_event( - "automatic_dynamic", - time.time_ns(), - { - "name": name, - "dim_changed": "scalar", - "reason": "scalar change", - "cached": str(frame_state_entry.scalar), - "new": str(value), - }, - ) - if self.source.guard_source().is_unspecialized_nn_module(): - log.info( - "%s", - ( - f"{name} is converted to a symbolic integer. It is an attribute of a " - "user defined nn module class. If you wish to keep it static, you can " - "mark the nn module class as `torch._dynamo.mark_static`." - ), - ) - frame_state_entry.scalar = None - self.tx.output.frame_state[name] = frame_state_entry - - if (st := self.tx.distributed_state) is None: - update_frame_state(value) - frame_state_entry = self.tx.output.frame_state[name] - elif st.all_states is None: - # Preflight, always pretend as if it's static - frame_state_entry = FrameStateSizeEntry( - size=None, scalar=value, stride=None - ) - st.local_state.input_sizes[name] = value - else: - # Apply the updates - for sub_state in st.all_states: - if name in sub_state.input_sizes: - update_frame_state(sub_state.input_sizes[name]) - frame_state_entry = self.tx.output.frame_state[name] + frame_state_entry = process_automatic_dynamic( + self.tx, + name, + FrameStateSizeEntry.make_scalar(value), + is_unspecialized_nn_module=self.source.guard_source().is_unspecialized_nn_module(), + ) # TODO: This should be dynamic, as we in general do not # know if bare integers are actually going to be sizevars # and it is inappropriate to eagerly duck size them with # real sizevars if ( - config.automatic_dynamic_shapes and frame_state_entry.scalar is None + config.automatic_dynamic_shapes + and frame_state_entry.scalar is auto_dynamic ) or not config.assume_static_by_default: dynamic_dim = DimDynamic.DYNAMIC else: # assume_static_by_default @@ -2493,132 +2443,42 @@ def _automatic_dynamic( ) # Prep for automatic dynamic - def update_frame_state(size, stride): - # Intentionally shadow e from parent scope so it is not accidentally - # called - e = None - frame_state_entry = None - if name not in tx.output.frame_state: - # If there is no entry for this source, add the tensor to frame state with its current static size. - # E.g., {} -> {"x": [2, 4]} - frame_state_entry = FrameStateSizeEntry(None, None, None) - frame_state_entry.size = list(size) - frame_state_entry.stride = list(stride) - else: - frame_state_entry = tx.output.frame_state[name] - if frame_state_entry.size is not None: - if len(size) != len(frame_state_entry.size): - # If there is already an entry, and the dim mismatches, replace the frame state entry with None. - # E.g. {"x": [2, 3, 4]} -> {"x": None} - log.debug( - "automatic dynamic %s dim %s != %s", - name, - len(size), - frame_state_entry.size, - ) - get_chromium_event_logger().log_instant_event( - "automatic_dynamic", - time.time_ns(), - { - "name": name, - "dim_changed": "all", - "reason": "dimensionality change", - "cached": str(frame_state_entry.size), - "new": str(size), - }, - ) - frame_state_entry.size = None - frame_state_entry.stride = None - else: - # If there is already an entry, and the dim matches, for every size/stride in the frame state which - # disagrees with the current static size/stride, replace it with None. 
- # E.g., {"x": [2, 3]} -> {"x": [2, # None]} - - has_size_changed = False - for i, dim in enumerate(frame_state_entry.size): - if dim is not None and size[i] != dim: - log.debug( - "automatic dynamic %s size(%s) %s != %s", - name, - i, - size[i], - dim, - ) - get_chromium_event_logger().log_instant_event( - "automatic_dynamic", - time.time_ns(), - { - "name": name, - "dim_changed": i, - "reason": "size change", - "cached": str(dim), - "new": str(size[i]), - }, - ) - frame_state_entry.size[i] = None - has_size_changed = ( - has_size_changed or frame_state_entry.size[i] is None - ) - # We want to trigger automatic dynamism when strides change, but we have to think whether stride should - # be INFER_STRIDE or DYNAMIC. - # - # Case 1: if strides change because of size changes, we might not want to allocate a new symbol for - # stride. Lets say we have a tensor (10, 20) and we mark the dim=1 dynamic for size. Resulting size will - # be (10, s0) and stride can be either (s0, 1) or (s1, 1). In most cases, (s0, 1) is preferred because - # users are not changing both size and stride. - # - # Case 2: But for another case, lets suppose the size remains same between the two invocations but stride - # change. In this case, we definitely want to mark the changing stride to be DYNAMIC. - - # Here, we use a hueristic to simplify determination of dynamic stride. For case 1, we will always - # assume that stride will be inferred (INFER_STRIDE). This might be suboptimal, where user is doing something - # arbitrary size and stride resizing, and we fail to trigger dynamism, but we have not seen any cases - # yet. For case 2, we will mark the changing dimensions DYNAMIC. - if not has_size_changed: - for i, dim in enumerate(frame_state_entry.stride): - if dim is not None and stride[i] != dim: - log.debug( - "automatic dynamic %s stride(%s) %s != %s", - name, - i, - stride[i], - dim, - ) - get_chromium_event_logger().log_instant_event( - "automatic_dynamic", - time.time_ns(), - { - "name": name, - "dim_changed": i, - "reason": "stride change", - "cached": str(dim), - "new": str(stride[i]), - }, - ) - frame_state_entry.stride[i] = None - tx.output.frame_state[name] = frame_state_entry - - if (st := tx.distributed_state) is None: - stride = e.stride() if not is_sparse_any(e) else () - update_frame_state(e.size(), stride) - frame_state_entry = tx.output.frame_state[name] - elif st.all_states is None: - # Preflight, always pretend as if it's static - frame_state_entry = FrameStateSizeEntry( - size=e.size(), scalar=None, stride=e.stride() - ) - st.local_state.input_sizes[name] = list(e.size()) - st.local_state.input_strides[name] = list(e.stride()) - else: - # Apply the updates - for sub_state in st.all_states: - # Not all inputs are necessarily present on all ranks - if name in sub_state.input_sizes and name in sub_state.input_strides: - update_frame_state( - sub_state.input_sizes[name], sub_state.input_strides[name] + # This mimics stride inference algorithm in _create_symbolic_sizes_strides_storage_offset + ex_size = e.size() + if not is_sparse_any(e): + ex_stride = e.stride() + dim = e.dim() + + stride = [None] * dim + while any(x is None for x in stride): + candidates = { + ex_size[i] * ex_stride[i]: InferStride(i) + for i in range(dim) + if stride[i] is not None and ex_stride[i] >= 0 + } + val_list = sorted( + [(ex_stride[i], i) for i in range(dim) if stride[i] is None], + key=_nested_int_aware_sort, + ) + for _, i in val_list: + if stride[i] is None and ex_stride[i] in candidates: + stride[i] = 
candidates[ex_stride[i]] + candidates[ex_stride[i] * ex_size[i]] = InferStride(i) + + if any(x is None for x in stride): + # bind the smallest unbound stride to a new variable + val, i = min( + [(ex_stride[i], i) for i in range(dim) if stride[i] is None], + key=_nested_int_aware_sort, ) - frame_state_entry = tx.output.frame_state[name] + stride[i] = val + else: + stride = [] + + frame_state_entry = process_automatic_dynamic( + tx, name, FrameStateSizeEntry.make_tensor(tuple(ex_size), tuple(stride)) + ) # TODO: index export_constraints ahead of time so we don't have to # do a linear scan every time here @@ -2663,28 +2523,31 @@ def update_dim2constraint(dim, constraint_range, name): marked_weak_dynamic = i in getattr(e, "_dynamo_weak_dynamic_indices", set()) marked_static = i in getattr(e, "_dynamo_static_indices", set()) + # Reflect the user directive in the frame_state + # For dynamic, apply None always + if marked_dynamic: + # TODO: This can be batched + # TODO: Doing this here is kind of sus, maybe better to set this + # up when we initially created the FrameStateSizeEntry to bong + # into the mutable state + log.debug("automatic dynamic %s marked dynamic", name) + mark_size = [auto_unset] * e.dim() + mark_size[i] = auto_dynamic + frame_state_entry |= FrameStateSizeEntry(size=mark_size) + # NB: both static and dynamic have precedence over - automatic_dynamic_size = config.automatic_dynamic_shapes and ( - frame_state_entry.size is None or frame_state_entry.size[i] is None + automatic_dynamic_size = ( + config.automatic_dynamic_shapes and frame_state_entry.is_size_dynamic(i) ) - - # if size is None, no need to make stride dynamic - automatic_dynamic_stride = config.automatic_dynamic_shapes and ( - frame_state_entry.size is not None - and ( - frame_state_entry.stride is None or frame_state_entry.stride[i] is None - ) + # NB: previously, if size was dynamic, we wouldn't make its stride + # dynamic. But now, because of InferStride concept, we will properly + # not make stride dynamic even if it's wobbling + automatic_dynamic_stride = ( + config.automatic_dynamic_shapes and frame_state_entry.is_stride_dynamic(i) ) automatic_dynamic = automatic_dynamic_size or automatic_dynamic_stride - # Reflect the user directive in the frame_state - # For dynamic, apply None always - if frame_state_entry.size and marked_dynamic: - log.debug("automatic dynamic %s marked dynamic", name) - frame_state_entry.size[i] = None - frame_state_entry.stride[i] = None - # We will process constraints first, as they will imply that we # have a dynamic dimension # Precedence: export constraints > eager constraints From c480a479b13856eb27068fd61a518e243d6536ac Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 26 Oct 2024 20:05:38 -0700 Subject: [PATCH 131/161] Make automatic_dynamic state live per CodeId, rather than on code object (#138740) This is semantics changing as if you are dealing with multiple code objects which have exactly the same filename/firstlineno/name, but are distinct objects, and need non-aliasing automatic dynamic state. Otherwise, this should be equivalent (modulo lifetime). I want to do this because when I do PGO I can't index on code object identity, need a stable identifier. Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/138740 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #138693, #138717 --- torch/_dynamo/__init__.py | 3 +++ torch/_dynamo/logging.py | 2 ++ torch/_dynamo/pgo.py | 40 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py index 4ca47f3d776cb..c3197d751094b 100644 --- a/torch/_dynamo/__init__.py +++ b/torch/_dynamo/__init__.py @@ -33,6 +33,7 @@ ) from .external_utils import is_compiling from .mutation_guard import GenerationTracker +from .pgo import CODE_STATE from .utils import graph_break_reasons, guard_failures, orig_code_map, reset_frame_count @@ -82,6 +83,7 @@ def reset() -> None: with convert_frame.compile_lock: reset_code_caches() convert_frame.input_codes.clear() + CODE_STATE.clear() convert_frame.output_codes.clear() orig_code_map.clear() guard_failures.clear() @@ -102,6 +104,7 @@ def reset() -> None: def reset_code_caches() -> None: """Clear compile caches that are keyed by code objects""" with convert_frame.compile_lock: + CODE_STATE.clear() for weak_code in ( convert_frame.input_codes.seen + convert_frame.output_codes.seen ): diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py index 6a4bed8052f09..627c9ff400447 100644 --- a/torch/_dynamo/logging.py +++ b/torch/_dynamo/logging.py @@ -54,6 +54,8 @@ def get_step_logger(logger: logging.Logger) -> Callable[..., None]: step = next(_step_counter) def log(level: int, msg: str, **kwargs: Any) -> None: + if "stacklevel" not in kwargs: + kwargs["stacklevel"] = 2 logger.log(level, "Step %s: %s", step, msg, **kwargs) return log diff --git a/torch/_dynamo/pgo.py b/torch/_dynamo/pgo.py index a4b5664848090..a71ee07652879 100644 --- a/torch/_dynamo/pgo.py +++ b/torch/_dynamo/pgo.py @@ -5,18 +5,50 @@ import enum import logging import time -from typing import Optional, Tuple, TYPE_CHECKING, TypeVar, Union +from collections import defaultdict +from typing import DefaultDict, Optional, Tuple, TYPE_CHECKING, TypeVar, Union from typing_extensions import Self from torch._dynamo.utils import get_chromium_event_logger if TYPE_CHECKING: + import types + from torch._dynamo.symbolic_convert import InstructionTranslator log = logging.getLogger(__name__) +# How does in memory representation work? Concretely, this module is +# responsible for holding GLOBAL state representing the state it holds, no +# other copies permitted. So we retire frame_state entirely and store it +# here. This should be reset when Dynamo is reset. We never GC information +# (similar to how the filesystem doesn't get cleaned up except by tmp +# cleaner), so the expectation is the information is relatively cheap and we +# don't mind leaking it. 
+ + +@dataclasses.dataclass(frozen=True) +class CodeId: + filename: str + firstlineno: int + name: str + + @staticmethod + def make(code: types.CodeType) -> CodeId: + return CodeId(code.co_filename, code.co_firstlineno, code.co_name) + + +@dataclasses.dataclass +class CodeState: + automatic_dynamic: DefaultDict[str, FrameStateSizeEntry] = dataclasses.field( + default_factory=lambda: defaultdict(FrameStateSizeEntry) + ) + + +CODE_STATE: DefaultDict[CodeId, CodeState] = defaultdict(CodeState) + @dataclasses.dataclass(frozen=True) class InferStride: @@ -162,8 +194,10 @@ def update_automatic_dynamic( *, is_unspecialized_nn_module: bool = False, ) -> FrameStateSizeEntry: - is_update = name in tx.output.frame_state - mut_entry = tx.output.frame_state.setdefault(name, FrameStateSizeEntry()) + code_id = CodeId.make(tx.f_code) + frame_state = CODE_STATE[code_id] + is_update = name in frame_state.automatic_dynamic + mut_entry = frame_state.automatic_dynamic[name] old_entry = copy.copy(mut_entry) mut_entry |= entry From 4681539f421b6e3d1e493d91faea2e768781054c Mon Sep 17 00:00:00 2001 From: Shunting Zhang Date: Fri, 25 Oct 2024 14:46:27 -0700 Subject: [PATCH 132/161] [inductor] force strides for efficient attn bwd (#138879) Try to fix https://github.com/pytorch/pytorch/issues/138772 . aten._scaled_dot_product_efficient_attention_backward requires the out and gradient_out to have stride order (3, 1, 2, 0). When Inductor layout optimization is enabled, Inductor may change tensor strides if they are not user visible. For efficient_attention_backward, Inductor tries to follow eager strides. But the eager strides Inductor gets for backward graph may be the one after optimization. There are a few possible fixes: 1. change the kernel to allow stride order other than (3, 1, 2, 0). This is probably hard 2. backout https://github.com/pytorch/pytorch/pull/112045/files and don't do layout optimization if the model contains efficient_attention. 3. Force (3, 1, 2, 0) strides order for the relevant tensors 4. Pass original eager layouts to Inductor for the backward graph. Let Inductor follow those layouts for tensors with extra layout requirement. The PR implements option 3. Option 4 looks more general to me, I think we can do this in long term. 
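For readers unfamiliar with Inductor's stride-order notation, here is a small illustrative sketch of what the required (3, 1, 2, 0) order means, assuming the usual convention that `stride_order[i]` is the rank of dim `i`'s stride with 0 for the smallest; the shapes and names below are made up for illustration, not taken from the kernel:

```python
# Illustrative only (not part of this patch): a 4-D (batch, heads, seq, head_dim)
# tensor whose stride order is (3, 1, 2, 0), i.e. head_dim innermost, batch outermost.
import torch

B, H, S, D = 2, 4, 16, 8
out = torch.empty(B, S, H, D).permute(0, 2, 1, 3)  # logical shape (B, H, S, D)
print(out.shape)    # torch.Size([2, 4, 16, 8])
print(out.stride()) # (512, 8, 32, 1) -> stride ranks (3, 1, 2, 0)
```

In other words, for a (batch, heads, seq, head_dim) tensor the head_dim axis must be innermost and the batch axis outermost, which is the layout option 3 forces for `out` and `gradient_out`.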
I tried to add a test but failed to repro: https://gist.github.com/shunting314/fe37a246aad269de9ea00199446688f6 Here is the original command to repro the issue: ``` TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 CUDA_LAUNCH_BLOCKING=1 time python benchmark.py --model maxvit_nano_rw_256 --precision bfloat16 --torchcompile --bench train --no-retry -b 64 ``` benchmark.py is https://github.com/huggingface/pytorch-image-models/blob/main/benchmark.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/138879 Approved by: https://github.com/drisspg, https://github.com/eellison --- torch/_inductor/lowering.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 6062cbcbb9fd5..b706694911a5a 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -2308,7 +2308,7 @@ def apply_constraint(arg, fx_arg): def sdpa_constraint(fx_node, *args, **kwargs): # sdpa requires dense last dimension] - def apply_constraint(arg, fx_arg): + def apply_constraint(idx, arg, fx_arg): if not isinstance(arg, ir.IRNode): return arg @@ -2316,10 +2316,23 @@ def apply_constraint(arg, fx_arg): meta_stride = meta_val.stride() stride_order = ir.get_stride_order(meta_stride) + if stride_order and stride_order[-1] != 0: # contiguous stride order stride_order = list(reversed(range(len(arg.get_size())))) + if ( + fx_node.target + == aten._scaled_dot_product_efficient_attention_backward.default + and idx in (0, 5) + ): + assert len(stride_order) == 4 + # The 0 and 5th arguments for aten._scaled_dot_product_efficient_attention_backward.default + # are for out and gradient_out. They have to be in + # (3, 1, 2, 0) stride order. Otherwise the kernel will crash. + # Check https://github.com/pytorch/pytorch/issues/138772 + stride_order = (3, 1, 2, 0) + if not meta_val.is_cuda: return ir.ExternKernel.require_stride_order(arg, stride_order) @@ -2362,9 +2375,10 @@ def is_aligned(x): return ir.ExternKernel.require_stride_order(arg, stride_order) args = tuple( - apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args) + apply_constraint(idx, arg, fx_arg) + for idx, (arg, fx_arg) in enumerate(zip(args, fx_node.args)) ) - kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()} + kwargs = {k: apply_constraint(-1, v, fx_node.kwargs[k]) for k, v in kwargs.items()} return args, kwargs From 1152726febfb034b12235fd044345a417a0f1607 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Sat, 26 Oct 2024 13:32:40 -0700 Subject: [PATCH 133/161] [PGNCCL] Use recursive mutex in NCCLComm (#138997) Fixes #138995: [PGNCCL][BUG] mutex acquired in recursive way may deadlock The fix: use `std::recursive_mutex` to replace `std::mutex`. Found and proposed by @dsjohns2. Thanks! 
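As a conceptual illustration of the deadlock being fixed (a Python analogue, not the actual C++ in NCCLUtils; the class and method names here are made up): acquiring a non-reentrant lock a second time on the same thread hangs, while a reentrant lock allows the nested acquisition that NCCLComm's methods perform.

```python
# Conceptual sketch only: std::mutex ~ threading.Lock, std::recursive_mutex ~ threading.RLock.
import threading

class Comm:
    def __init__(self, reentrant: bool):
        self._mutex = threading.RLock() if reentrant else threading.Lock()
        self._aborted = False

    def is_aborted(self) -> bool:
        with self._mutex:
            return self._aborted

    def abort(self) -> None:
        with self._mutex:
            # Calls another locking method while already holding the lock,
            # mirroring the re-entrant pattern that motivated this fix.
            if self.is_aborted():
                return
            self._aborted = True

Comm(reentrant=True).abort()    # completes
# Comm(reentrant=False).abort() # would hang: a plain Lock is not reentrant
```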
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138997 Approved by: https://github.com/dsjohns2 --- torch/csrc/distributed/c10d/NCCLUtils.cpp | 2 +- torch/csrc/distributed/c10d/NCCLUtils.hpp | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index a86039c6ef4d4..00bd235c86666 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -16,7 +16,7 @@ namespace c10d { ncclComm_t NCCLComm::getNcclComm() { - std::unique_lock lock(mutex_); + LockType lock(mutex_); if (aborted_) { auto commFailureMsg = commFailureReason_ != std::nullopt ? c10::str(" Original reason for failure was: ", *commFailureReason_) diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index af32ab83ef57c..0089d453bb85a 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -275,6 +275,9 @@ class TORCH_API DebugInfoWriter { // RAII wrapper for NCCL communicator class NCCLComm { + using MutexType = std::recursive_mutex; + using LockType = std::unique_lock; + public: explicit NCCLComm(ncclComm_t ncclComm) : ncclComm_(ncclComm) {} @@ -283,7 +286,7 @@ class NCCLComm { ~NCCLComm() noexcept { // Add lock in this destructor, as aborted_ needs to be read after memory // barrier here. - std::unique_lock lock(mutex_); + LockType lock(mutex_); if (ncclComm_ && initialized_ && !aborted_) { #ifdef ENABLE_NCCL_ERROR_CHECKING // Use ncclCommAbort instead of ncclCommDestroy here since @@ -371,7 +374,7 @@ class NCCLComm { NCCLComm(NCCLComm&& other) { // Using other's lock, as it reads other's states // Can not use this.mutex_, as this object is being constructed. - std::unique_lock lock(other.mutex_); + LockType lock(other.mutex_); std::swap(ncclComm_, other.ncclComm_); std::swap(aborted_, other.aborted_); std::swap(ncclAsyncErr_, other.ncclAsyncErr_); @@ -382,13 +385,13 @@ class NCCLComm { ncclComm_t getNcclComm(); std::optional getNcclCommFailureReason() const { - std::unique_lock lock(mutex_); + LockType lock(mutex_); return commFailureReason_; } void ncclCommAbort( std::optional commFailureReason = std::nullopt) { - std::unique_lock lock(mutex_); + LockType lock(mutex_); #ifdef ENABLE_NCCL_ERROR_CHECKING if (aborted_ && !initialized_) { // Should not abort twice. @@ -436,12 +439,12 @@ class NCCLComm { } bool isInitialized() const { - std::unique_lock lock(mutex_); + LockType lock(mutex_); return initialized_; } bool isAborted() const { - std::unique_lock lock(mutex_); + LockType lock(mutex_); return aborted_; } @@ -450,7 +453,7 @@ class NCCLComm { } ncclResult_t checkForNcclError() { - std::unique_lock lock(mutex_); + LockType lock(mutex_); #ifdef ENABLE_NCCL_ERROR_CHECKING if (ncclAsyncErr_ != ncclSuccess) { return ncclAsyncErr_; @@ -465,7 +468,7 @@ class NCCLComm { } ncclResult_t registerSegment(void* ptr, size_t size) { - std::unique_lock lock(mutex_); + LockType lock(mutex_); #ifdef NCCL_HAS_COMM_REGISTER // We register only segments from cache allocator // which are guaranteed to be with disjoint addr ranges. 
Thus, a ptr always @@ -498,7 +501,7 @@ class NCCLComm { } ncclResult_t deregisterSegment(void* ptr) { - std::unique_lock lock(mutex_); + LockType lock(mutex_); #ifdef NCCL_HAS_COMM_REGISTER TORCH_CHECK( registeredSegmentHandles_.count(ptr) == 1, @@ -538,7 +541,7 @@ class NCCLComm { bool aborted_{false}; uint64_t ncclCommSplitCounter_{0}; ncclResult_t ncclAsyncErr_{ncclSuccess}; - mutable std::mutex mutex_; + mutable MutexType mutex_; // Rank that this communicator corresponds to. int rank_{}; // Optional reason for communicator failure, provided by ProcessGroupNCCL for From 40c098f731aaf2e8dfebb10abd1c6bb7830bd98b Mon Sep 17 00:00:00 2001 From: "Yu, Guangye" Date: Sun, 27 Oct 2024 00:37:46 +0000 Subject: [PATCH 134/161] Introduce a device-agnostic runtime API design (#132204) # Motivation According to [[RFC]A device-agnostic Python runtime API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/128403), this PR intends to introduce a device-agnostic runtime API design. I personally prefer the **Simple Version** APIs that no longer accept the device type as an input argument. It means we will leverage `getAccelerator` to fetch the current accelerator. And it is flexible to expand these APIs to handle multiple types of accelerator scenarios. The design does **NOT** break the previous design philosophies. I also believe that namespace torch.accelerator is better. It lets users know that the APIs they are calling are running on an accelerator rather than CPU. This is important. Meanwhile, we can follow a simple API design principle: 1. Device-agnostic APIs should be placed under the torch.accelerator namespace and not accept a device_type optional parameter. 2. Device-specific APIs should be placed under device-specific submodules. 3. APIS required by both CPU and accelerators should be placed under the torch namespace and accept a device_type optional parameter. Also, I list the pros and cons of **Simple Version** here: Pros: - `torch.accelerator.foo` will have the same input argument as `torch.xxx.foo`, bringing a better user experience; - more concise, facilitate the developer to write a device-agnostic code. Cons: - no obvious drawbacks. # Additional Context I list the new APIs here: ```python torch.accelerator.is_available() -> bool: torch.accelerator.current_accelerator() -> torch.device: torch.accelerator.device_count() -> int: torch.accelerator.current_device_idx() -> int: torch.accelerator.set_device_idx(device: Union[torch.device, str, int, None]) -> None: torch.accelerator.current_stream(device: Union[torch.device, str, int, None]) -> torch.Stream: torch.accelerator.set_stream(stream: torch.Stream) -> None: torch.accelerator.synchronize(device: Union[torch.device, str, int, None]) -> None: ``` According to the discussion with Alban, we decide to change the API name `set_device` to `set_device_idx` and `current_device` to `current_device_idx` for more explicit. And will submit other PR to support device and stream context manager. 
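For a concrete picture of how these APIs are meant to compose, here is a short device-agnostic sketch using only the functions listed above (illustrative; it assumes at least one accelerator is present when the first branch is taken):

```python
import torch

if torch.accelerator.is_available():
    acc = torch.accelerator.current_accelerator()   # e.g. device(type='cuda')
    print(f"{torch.accelerator.device_count()} {acc.type} device(s)")
    torch.accelerator.set_device_idx(0)
    stream = torch.accelerator.current_stream()     # stream on the current device index
    x = torch.randn(1024, 1024, device=acc)
    y = x @ x
    torch.accelerator.synchronize()                 # wait for all queued work on the device
else:
    y = torch.randn(1024, 1024) @ torch.randn(1024, 1024)
```

The same code path works unchanged whether the accelerator is CUDA, XPU, or another backend, which is the point of putting these APIs under `torch.accelerator` rather than a device-specific submodule.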
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132204 Approved by: https://github.com/EikanWang, https://github.com/abhilash1910, https://github.com/gujinghui, https://github.com/albanD --- .../hip/impl/HIPGuardImplMasqueradingAsCUDA.h | 9 ++ aten/src/ATen/mps/MPSGuardImpl.h | 2 + aten/src/ATen/mps/MPSGuardImpl.mm | 4 + build_variables.bzl | 1 + c10/core/impl/DeviceGuardImplInterface.h | 9 ++ c10/core/impl/VirtualGuardImpl.h | 4 + c10/cuda/impl/CUDAGuardImpl.h | 13 ++ c10/xpu/impl/XPUGuardImpl.h | 8 + docs/source/accelerator.rst | 17 ++ docs/source/index.rst | 1 + torch/_C/__init__.pyi.in | 9 ++ torch/__init__.py | 1 + torch/accelerator/__init__.py | 145 ++++++++++++++++++ torch/accelerator/_utils.py | 28 ++++ torch/csrc/DeviceAccelerator.cpp | 82 ++++++++++ torch/csrc/DeviceAccelerator.h | 8 + torch/csrc/Module.cpp | 2 + 17 files changed, 343 insertions(+) create mode 100644 docs/source/accelerator.rst create mode 100644 torch/accelerator/__init__.py create mode 100644 torch/accelerator/_utils.py create mode 100644 torch/csrc/DeviceAccelerator.cpp create mode 100644 torch/csrc/DeviceAccelerator.h diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h index 6be1aed915e47..4ec944034be4b 100644 --- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -216,6 +216,15 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI C10_HIP_CHECK(hipEventSynchronize(hip_event)); } + // Note: synchronizeDevice can be safely called from any device + void synchronizeDevice(const c10::DeviceIndex device_index) const override { + int orig_device{-1}; + C10_HIP_CHECK(hipGetDevice(&orig_device)); + C10_HIP_CHECK(hipSetDevice(device_index)); + C10_HIP_CHECK(hipDeviceSynchronize()); + C10_HIP_CHECK(hipSetDevice(orig_device)); + } + void recordDataPtrOnStream( const c10::DataPtr& data_ptr, const Stream& stream) const override { diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h index cb50df2faeaee..6132cd8055e19 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.h +++ b/aten/src/ATen/mps/MPSGuardImpl.h @@ -111,6 +111,8 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface bool queryEvent(void* event) const override; + void synchronizeDevice(const DeviceIndex device_index) const override; + }; /// A variant of OptionalDeviceGuard that is specialized for MPS. 
diff --git a/aten/src/ATen/mps/MPSGuardImpl.mm b/aten/src/ATen/mps/MPSGuardImpl.mm index f832516c5da1b..a3dea4cd7c4d2 100644 --- a/aten/src/ATen/mps/MPSGuardImpl.mm +++ b/aten/src/ATen/mps/MPSGuardImpl.mm @@ -42,4 +42,8 @@ return mps_event->query(); } +void MPSGuardImpl::synchronizeDevice(const DeviceIndex device_index) const { + at::mps::getDefaultMPSStream()->synchronize(SyncType::COMMIT_AND_WAIT); +} + } // namespace at::mps diff --git a/build_variables.bzl b/build_variables.bzl index 56f7bb6cf5aa1..36154e93e5040 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -795,6 +795,7 @@ libtorch_python_xpu_sources = [ libtorch_python_core_sources = [ "torch/csrc/DataLoader.cpp", + "torch/csrc/DeviceAccelerator.cpp", "torch/csrc/Device.cpp", "torch/csrc/Dtype.cpp", "torch/csrc/DynamicTypes.cpp", diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index a9b9b1219dfed..f145db0d234dd 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -212,6 +212,15 @@ struct C10_API DeviceGuardImplInterface { TORCH_CHECK(false, "Backend doesn't support synchronizing events."); } + /** + * Wait (by blocking the calling thread) until all the work previously + * enqueued on the device has been completed. + */ + virtual void synchronizeDevice(const DeviceIndex /*device_index*/) const { + TORCH_CHECK( + false, "Backend doesn't support synchronizing all streams on device."); + } + /** * Ensure the caching allocator (if any) is aware that the given DataPtr is * being used on the given stream, and that it should thus avoid recycling the diff --git a/c10/core/impl/VirtualGuardImpl.h b/c10/core/impl/VirtualGuardImpl.h index 1d26eef0c9e17..cdee2aa1a644a 100644 --- a/c10/core/impl/VirtualGuardImpl.h +++ b/c10/core/impl/VirtualGuardImpl.h @@ -96,6 +96,10 @@ class VirtualGuardImpl final : public DeviceGuardImplInterface { return impl_->synchronizeEvent(event); } + void synchronizeDevice(const DeviceIndex device_index) const override { + return impl_->synchronizeDevice(device_index); + } + private: const DeviceGuardImplInterface* impl_ = nullptr; }; diff --git a/c10/cuda/impl/CUDAGuardImpl.h b/c10/cuda/impl/CUDAGuardImpl.h index 1ef2fcb2c08f4..dd81dcf51fda1 100644 --- a/c10/cuda/impl/CUDAGuardImpl.h +++ b/c10/cuda/impl/CUDAGuardImpl.h @@ -219,6 +219,19 @@ struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { C10_CUDA_CHECK(cudaEventSynchronize(cuda_event)); } + // Note: synchronizeDevice can be safely called from any device + void synchronizeDevice(const c10::DeviceIndex device_index) const override { + DeviceIndex orig_device{-1}; + C10_CUDA_CHECK(c10::cuda::GetDevice(&orig_device)); + C10_CUDA_CHECK(c10::cuda::SetDevice(device_index)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_device_synchronization(c10::kCUDA); + } + C10_CUDA_CHECK(cudaDeviceSynchronize()); + C10_CUDA_CHECK(c10::cuda::SetDevice(orig_device)); + } + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override { CUDAStream cuda_stream{stream}; diff --git a/c10/xpu/impl/XPUGuardImpl.h b/c10/xpu/impl/XPUGuardImpl.h index 6213eccd2b243..5cb60a6a85056 100644 --- a/c10/xpu/impl/XPUGuardImpl.h +++ b/c10/xpu/impl/XPUGuardImpl.h @@ -163,6 +163,14 @@ struct XPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { xpu_event->wait_and_throw(); } + void synchronizeDevice(const c10::DeviceIndex device_index) const 
override { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_device_synchronization(c10::kXPU); + } + c10::xpu::syncStreamsOnDevice(device_index); + } + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override { const XPUStream xpu_stream{stream}; diff --git a/docs/source/accelerator.rst b/docs/source/accelerator.rst new file mode 100644 index 0000000000000..6e4d7a541eeb8 --- /dev/null +++ b/docs/source/accelerator.rst @@ -0,0 +1,17 @@ +torch.accelerator +=================================== +.. automodule:: torch.accelerator +.. currentmodule:: torch.accelerator + +.. autosummary:: + :toctree: generated + :nosignatures: + + device_count + is_available + current_accelerator + set_device_idx + current_device_idx + set_stream + current_stream + synchronize diff --git a/docs/source/index.rst b/docs/source/index.rst index 773e64204293b..61325ff0ba815 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -64,6 +64,7 @@ Features described in this documentation are classified by release status: torch.amp torch.autograd torch.library + accelerator cpu cuda torch.cuda.memory diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 4fec36e8e6574..930e4be2420e1 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -2183,6 +2183,15 @@ def _set_worker_pids( def _remove_worker_pids(loader_id: _int) -> None: ... # THPModule_removeWorkerPIDs def _error_if_any_worker_fails() -> None: ... # THPModule_errorIfAnyWorkerFails +# Defined in torch/csrc/DeviceAccelerator.cpp +def _accelerator_getAccelerator() -> _device: ... +def _accelerator_deviceCount() -> _int: ... +def _accelerator_setDeviceIndex(device_index: _int) -> None: ... +def _accelerator_getDeviceIndex() -> _int: ... +def _accelerator_setStream(Stream) -> None: ... +def _accelerator_getStream(device_index: _int) -> Stream: ... +def _accelerator_synchronizeDevice(device_index: _int) -> None: ... + # Defined in torch/csrc/jit/python/python_tracer.cpp class TracingState: def push_scope(self, scope_name: str) -> None: ... diff --git a/torch/__init__.py b/torch/__init__.py index 5ff3c610abff6..144af1f508eef 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -2092,6 +2092,7 @@ def _assert(condition, message): __config__ as __config__, __future__ as __future__, _awaits as _awaits, + accelerator as accelerator, autograd as autograd, backends as backends, cpu as cpu, diff --git a/torch/accelerator/__init__.py b/torch/accelerator/__init__.py new file mode 100644 index 0000000000000..f4d7593175baf --- /dev/null +++ b/torch/accelerator/__init__.py @@ -0,0 +1,145 @@ +r""" +This package introduces support for the current :ref:`accelerator` in python. +""" + +import torch + +from ._utils import _device_t, _get_device_index + + +def device_count() -> int: + r"""Return the number of current :ref:`accelerator` available. + + Returns: + int: the number of the current :ref:`accelerator` available. + If there is no available accelerators, return 0. + """ + return torch._C._accelerator_deviceCount() + + +def is_available() -> bool: + r"""Check if there is an available :ref:`accelerator`. + + Returns: + bool: A boolean indicating if there is an available :ref:`accelerator`. + + Example:: + + >>> assert torch.accelerator.is_available() "No available accelerators detected." 
+ """ + return device_count() > 0 + + +def current_accelerator() -> torch.device: + r"""Return the device of the current :ref:`accelerator`. + + Returns: + torch.device: return the current accelerator as :class:`torch.device`. + + .. note:: The index of the returned :class:`torch.device` will be ``None``, please use + :func:`torch.accelerator.current_device_idx` to know the current index being used. + And ensure to use :func:`torch.accelerator.is_available` to check if there is an available + accelerator. If there is no available accelerator, this function will raise an exception. + + Example:: + + >>> # xdoctest: + >>> if torch.accelerator.is_available(): + >>> current_device = torch.accelerator.current_accelerator() + >>> else: + >>> current_device = torch.device("cpu") + >>> if current_device.type == 'cuda': + >>> is_half_supported = torch.cuda.has_half + >>> elif current_device.type == 'xpu': + >>> is_half_supported = torch.xpu.get_device_properties().has_fp16 + >>> elif current_device.type == 'cpu': + >>> is_half_supported = True + """ + return torch._C._accelerator_getAccelerator() + + +def current_device_idx() -> int: + r"""Return the index of a currently selected device for the current :ref:`accelerator`. + + Returns: + int: the index of a currently selected device. + """ + return torch._C._accelerator_getDeviceIndex() + + +def set_device_idx(device: _device_t, /) -> None: + r"""Set the current device index to a given device. + + Args: + device (:class:`torch.device`, str, int): a given device that must match the current + :ref:`accelerator` device type. + + .. note:: This function is a no-op if this device index is negative. + """ + device_index = _get_device_index(device) + torch._C._accelerator_setDeviceIndex(device_index) + + +def current_stream(device: _device_t = None, /) -> torch.Stream: + r"""Return the currently selected stream for a given device. + + Args: + device (:class:`torch.device`, str, int, optional): a given device that must match the current + :ref:`accelerator` device type. If not given, + use :func:`torch.accelerator.current_device_idx` by default. + + Returns: + torch.Stream: the currently selected stream for a given device. + """ + device_index = _get_device_index(device, True) + return torch._C._accelerator_getStream(device_index) + + +def set_stream(stream: torch.Stream) -> None: + r"""Set the current stream to a given stream. + + Args: + stream (torch.Stream): a given stream that must match the current :ref:`accelerator` device type. + + .. note:: This function will set the current device index to the device index of the given stream. + """ + torch._C._accelerator_setStream(stream) + + +def synchronize(device: _device_t = None, /) -> None: + r"""Wait for all kernels in all streams on the given device to complete. + + Args: + device (:class:`torch.device`, str, int, optional): device for which to synchronize. It must match + the current :ref:`accelerator` device type. If not given, + use :func:`torch.accelerator.current_device_idx` by default. + + .. note:: This function is a no-op if the current :ref:`accelerator` is not initialized. + + Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> assert torch.accelerator.is_available() "No available accelerators detected." 
+ >>> start_event = torch.Event(enable_timing=True) + >>> end_event = torch.Event(enable_timing=True) + >>> start_event.record() + >>> tensor = torch.randn(100, device=torch.accelerator.current_accelerator()) + >>> sum = torch.sum(tensor) + >>> end_event.record() + >>> torch.accelerator.synchronize() + >>> elapsed_time_ms = start_event.elapsed_time(end_event) + """ + device_index = _get_device_index(device, True) + torch._C._accelerator_synchronizeDevice(device_index) + + +__all__ = [ + "current_accelerator", + "current_device_idx", + "current_stream", + "device_count", + "is_available", + "set_device_idx", + "set_stream", + "synchronize", +] diff --git a/torch/accelerator/_utils.py b/torch/accelerator/_utils.py new file mode 100644 index 0000000000000..abaa00c44b5bc --- /dev/null +++ b/torch/accelerator/_utils.py @@ -0,0 +1,28 @@ +from typing import Optional, Union + +import torch +from torch import device as _device + + +_device_t = Union[_device, str, int, None] + + +def _get_device_index(device: _device_t, optional: bool = False) -> int: + if isinstance(device, int): + return device + if isinstance(device, str): + device = torch.device(device) + device_index: Optional[int] = None + if isinstance(device, torch.device): + if torch.accelerator.current_accelerator() != device.type: + raise ValueError( + f"{device.type} doesn't match the current accelerator {torch.accelerator.current_accelerator()}." + ) + device_index = device.index + if device_index is None: + if not optional: + raise ValueError( + f"Expected a torch.device with a specified index or an integer, but got:{device}" + ) + return torch.accelerator.current_device_idx() + return device_index diff --git a/torch/csrc/DeviceAccelerator.cpp b/torch/csrc/DeviceAccelerator.cpp new file mode 100644 index 0000000000000..67bd30acbf40d --- /dev/null +++ b/torch/csrc/DeviceAccelerator.cpp @@ -0,0 +1,82 @@ +#include +#include +#include + +namespace torch::accelerator { + +void initModule(PyObject* module) { + auto m = py::handle(module).cast(); + + m.def("_accelerator_getAccelerator", []() { + // If no accelerator is currently available, raise an exception. 
+ return c10::Device(at::getAccelerator(true).value()); + }); + + m.def("_accelerator_deviceCount", []() { + const auto device_type = at::getAccelerator(false); + if (!device_type.has_value()) { + return static_cast(0); + } + torch::utils::maybe_initialize_device(device_type.value()); + c10::impl::VirtualGuardImpl impl(device_type.value()); + return static_cast(impl.deviceCount()); + }); + + m.def("_accelerator_setDeviceIndex", [](c10::DeviceIndex device_index) { + const auto device_type = at::getAccelerator(true).value(); + // If device index is negative, no-op + if (device_index < 0) { + return; + } + torch::utils::maybe_initialize_device(device_type); + c10::impl::VirtualGuardImpl impl(device_type); + impl.setDevice({device_type, device_index}); + }); + + m.def("_accelerator_getDeviceIndex", []() { + const auto device_type = at::getAccelerator(true).value(); + torch::utils::maybe_initialize_device(device_type); + c10::impl::VirtualGuardImpl impl(device_type); + return static_cast(impl.getDevice().index()); + }); + + m.def("_accelerator_setStream", [](c10::Stream stream) { + const auto device_type = at::getAccelerator(true).value(); + TORCH_CHECK( + device_type == stream.device_type(), + "stream's device type ", + c10::DeviceTypeName(stream.device_type()), + " doesn't match the current accelerator ", + c10::DeviceTypeName(device_type)); + torch::utils::maybe_initialize_device(device_type); + c10::impl::VirtualGuardImpl impl(device_type); + // Set the current device to the device of stream + if (impl.getDevice().index() != stream.device_index()) { + impl.setDevice(stream.device()); + } + impl.exchangeStream(stream); + }); + + m.def("_accelerator_getStream", [](c10::DeviceIndex device_index) { + const auto device_type = at::getAccelerator(true).value(); + torch::utils::maybe_initialize_device(device_type); + c10::impl::VirtualGuardImpl impl(device_type); + return impl.getStream({device_type, device_index}); + }); + + m.def("_accelerator_synchronizeDevice", [](c10::DeviceIndex device_index) { + const auto device_type = at::getAccelerator(true).value(); + if (!torch::utils::is_device_initialized(device_type)) { + return; + } + torch::utils::maybe_initialize_device(device_type); + c10::impl::VirtualGuardImpl impl(device_type); + // impl.synchronizeDevice should can be safely called from any device + { + py::gil_scoped_release no_gil; + impl.synchronizeDevice(device_index); + } + }); +} + +} // namespace torch::accelerator diff --git a/torch/csrc/DeviceAccelerator.h b/torch/csrc/DeviceAccelerator.h new file mode 100644 index 0000000000000..87b20e4576f4f --- /dev/null +++ b/torch/csrc/DeviceAccelerator.h @@ -0,0 +1,8 @@ +#include +#include + +namespace torch::accelerator { + +void initModule(PyObject* module); + +} // namespace torch::accelerator diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 416e5b5d72b4c..e11294418a305 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -1733,6 +1734,7 @@ PyObject* initModule() { #endif torch::mtia::initModule(module); torch::cpu::initModule(module); + torch::accelerator::initModule(module); torch::instruction_counter::initModule(module); torch::initVerboseBindings(module); ASSERT_TRUE(THPStorage_init(module)); From d9534a50a933f42d1cc126f73a52c025f497da3d Mon Sep 17 00:00:00 2001 From: Bin Bao Date: Sun, 27 Oct 2024 14:14:27 +0000 Subject: [PATCH 135/161] [AOTI][refactor] Separate header codegen (#138882) Summary: Move arrayref specific header 
codegen logic to cpp_wrapper_cpu_array_ref.py, and consolidate some header files codegen logic Test Plan: CI Differential Revision: D64899248 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138882 Approved by: https://github.com/hl475 --- .../codegen/aoti_runtime/implementation.cpp | 7 +- .../codegen/aoti_runtime/interface.cpp | 5 +- torch/_inductor/codegen/cpp_wrapper_cpu.py | 88 ++++++++----------- .../codegen/cpp_wrapper_cpu_array_ref.py | 14 +++ 4 files changed, 58 insertions(+), 56 deletions(-) diff --git a/torch/_inductor/codegen/aoti_runtime/implementation.cpp b/torch/_inductor/codegen/aoti_runtime/implementation.cpp index 0273aa9aa8df0..017e7a104d5b0 100644 --- a/torch/_inductor/codegen/aoti_runtime/implementation.cpp +++ b/torch/_inductor/codegen/aoti_runtime/implementation.cpp @@ -1,8 +1,11 @@ // NOTE: Like interface.cpp, this file will be copied into AOTInductor // generated output. This file is intended to keep implementation // details separate from the implementation of the AOTI public -// interface. Note also that #includes should go into interface.cpp -// for simplicity of maintenance. +// interface. +#include +#include +#include +#include namespace torch { namespace aot_inductor { diff --git a/torch/_inductor/codegen/aoti_runtime/interface.cpp b/torch/_inductor/codegen/aoti_runtime/interface.cpp index db85f849e3b29..b270ccbeef945 100644 --- a/torch/_inductor/codegen/aoti_runtime/interface.cpp +++ b/torch/_inductor/codegen/aoti_runtime/interface.cpp @@ -1,8 +1,7 @@ -#include +// Definition of AOTI runtime interface functions + #include #include -#include -#include #include #include diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index 891ed89ed8d66..0e51e9f7c2570 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -146,13 +146,16 @@ def write_header(self): return if V.graph.aot_mode: - for header_cpp_file in ("interface.cpp", "implementation.cpp"): - with open( - os.path.join( - os.path.dirname(__file__), "aoti_runtime", header_cpp_file - ) - ) as f: - self.header.splice(f.read()) + self.header.splice( + """ + #include + #include + """ + ) + with open( + os.path.join(os.path.dirname(__file__), "aoti_runtime", "interface.cpp") + ) as f: + self.header.splice(f.read()) else: self.header.splice( """ @@ -161,44 +164,8 @@ def write_header(self): cpp_wrapper_src = ( ''' - """ - ) - - self.header.splice( - f"#include " - ) - self.header.splice( - """ - #include - #include - #include - """ - ) - if V.graph.aot_mode: - self.header.splice( - """ - #include - """ - ) - - enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [ - "linux", - "win32", - ] - if config.profiler_mark_wrapper_call or enable_kernel_profile: - self.header.splice("#include ") - - self.header.splice("typedef at::Half half;") - self.header.splice("typedef at::BFloat16 bfloat16;") - self.header.splice("#include ") - - if not V.graph.aot_mode: - self.header.splice( - """ #include - namespace py = pybind11; - using namespace torch::aot_inductor; class RAIIPyObject { public: @@ -224,19 +191,40 @@ class RAIIPyObject { private: PyObject* obj_; }; + + #include + #include + using namespace torch::aot_inductor; """ ) - # Round up to the nearest multiple of ALIGN_BYTES - # ALIGN_BYTES must be a power of 2 self.header.splice( f""" + #include + #include + #include + #include + + #include + typedef at::Half half; + typedef at::BFloat16 bfloat16; + + // Round up to the nearest 
multiple of {ALIGN_BYTES} [[maybe_unused]] static int64_t align(int64_t nbytes) {{ return (nbytes + {ALIGN_BYTES} - 1) & -{ALIGN_BYTES}; }} """ ) + enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [ + "linux", + "win32", + ] + if config.profiler_mark_wrapper_call or enable_kernel_profile: + # No C shim for profiling APIs, assuming profiling is a debugging feature which + # does not provide any ABI compatibility promise. + self.header.splice("#include ") + @functools.lru_cache(None) # noqa: B019 def include_extra_header(self, header: str): # This is needed for cpp to python dtype conversion @@ -259,10 +247,8 @@ def write_prefix(self): if V.graph.is_const_graph: # We do not write prefix for constant graph, it will be written by main module. return - if V.graph.aot_mode: - self.prefix.writeline("namespace torch {") - self.prefix.writeline("namespace aot_inductor {") + self.prefix.writeline("namespace torch::aot_inductor {") def write_input_output_info( self, @@ -945,14 +931,14 @@ def generate_end(self, result): if V.graph.is_const_graph: result.writeline("} // AOTInductorModel::_const_run_impl") else: - result.writeline("} // namespace aot_inductor") - result.writeline("} // namespace torch") + result.writeline("} // namespace torch::aot_inductor\n\n\n") return # cpp entry function for JIT with cpp wrapper - result.writeline("'''\n)") result.splice( f""" + ''' + ) inductor_entry = CppWrapperCodeCache.load_pybinding( ["std::vector"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)}) """ diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py index 4cdff622dd646..14add6fef0895 100644 --- a/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py @@ -1,4 +1,5 @@ # mypy: allow-untyped-defs +import os from itertools import count from typing import Dict, List, Optional, Tuple @@ -97,6 +98,19 @@ def get_input_cpp_type(input): return DTYPE_TO_CPP[dtype] return f"ArrayRefTensor<{DTYPE_TO_CPP[input.get_dtype()]}>" + def write_header(self): + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. 
+ return + + super().write_header() + with open( + os.path.join( + os.path.dirname(__file__), "aoti_runtime", "implementation.cpp" + ) + ) as f: + self.header.splice(f.read()) + def codegen_input_numel_asserts(self): for name, buf in V.graph.graph_inputs.items(): if isinstance(buf, sympy.Expr): From 5d074746e91205aa63fea07bb7482277dacc3b65 Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Sun, 27 Oct 2024 14:18:58 +0000 Subject: [PATCH 136/161] [BE]: Add better optional typing (#138426) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138426 Approved by: https://github.com/XuehaiPan, https://github.com/malfet --- benchmarks/functional_autograd_benchmark/utils.py | 6 ++++-- test/dynamo/test_global.py | 8 +++++--- test/dynamo/test_python_autograd.py | 4 ++-- test/fx/test_dce_pass.py | 4 ++-- test/inductor/test_cutlass_backend.py | 2 +- test/quantization/pt2e/test_representation.py | 4 ++-- test/test_cuda_sanitizer.py | 14 +++++++------- torch/_dynamo/variables/iter.py | 2 +- torch/overrides.py | 4 ++-- torch/testing/_internal/optests/generate_tests.py | 2 +- 10 files changed, 27 insertions(+), 23 deletions(-) diff --git a/benchmarks/functional_autograd_benchmark/utils.py b/benchmarks/functional_autograd_benchmark/utils.py index 87d676f4fb31c..e19570ffe3cb9 100644 --- a/benchmarks/functional_autograd_benchmark/utils.py +++ b/benchmarks/functional_autograd_benchmark/utils.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Callable, Dict, List, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn, Tensor @@ -76,7 +76,9 @@ def load_weights(mod: nn.Module, names: List[str], params: Tuple[Tensor, ...]) - # Utilities to read/write markdown table-like content. -def to_markdown_table(res: TimingResultType, header: Tuple[str, ...] 
= None) -> str: +def to_markdown_table( + res: TimingResultType, header: Optional[Tuple[str, ...]] = None +) -> str: if header is None: header = ("model", "task", "mean", "var") out = "" diff --git a/test/dynamo/test_global.py b/test/dynamo/test_global.py index a7dd07175996d..76476913ce626 100644 --- a/test/dynamo/test_global.py +++ b/test/dynamo/test_global.py @@ -1,4 +1,6 @@ # Owner(s): ["module: dynamo"] +from typing import Optional + import torch import torch._dynamo.test_case import torch._dynamo.testing @@ -183,7 +185,7 @@ def fn(x): def test_store_global_inline_1(self): # Borrowed from test_python_autograd.py class Variable: - def __init__(self, value: torch.Tensor, name: str = None): + def __init__(self, value: torch.Tensor, name: Optional[str] = None): self.value = value self.name = name or fresh_name() @@ -203,12 +205,12 @@ def fn(a, b): def test_store_global_inline_2(self): # Borrowed from test_python_autograd.py class Variable: - def __init__(self, value: torch.Tensor, name: str = None): + def __init__(self, value: torch.Tensor, name: Optional[str] = None): self.value = value self.name = name or fresh_name() @staticmethod - def constant(value: torch.Tensor, name: str = None): + def constant(value: torch.Tensor, name: Optional[str] = None): return Variable(value, name) def fn(a, b): diff --git a/test/dynamo/test_python_autograd.py b/test/dynamo/test_python_autograd.py index defc71a97afc2..e8c628fe33435 100644 --- a/test/dynamo/test_python_autograd.py +++ b/test/dynamo/test_python_autograd.py @@ -26,14 +26,14 @@ def fresh_name() -> str: class Variable: - def __init__(self, value: torch.Tensor, name: str = None): + def __init__(self, value: torch.Tensor, name: Optional[str] = None): self.value = value self.name = name or fresh_name() # We need to start with some tensors whose values were not computed # inside the autograd. This function constructs leaf nodes. 
@staticmethod - def constant(value: torch.Tensor, name: str = None): + def constant(value: torch.Tensor, name: Optional[str] = None): return Variable(value, name) def __repr__(self): diff --git a/test/fx/test_dce_pass.py b/test/fx/test_dce_pass.py index ab107716208dc..2e6821f920bc1 100644 --- a/test/fx/test_dce_pass.py +++ b/test/fx/test_dce_pass.py @@ -2,7 +2,7 @@ import copy import unittest -from typing import Set, Type +from typing import Optional, Set, Type import torch import torch.fx @@ -39,7 +39,7 @@ def _run_dce_and_test( self, m: torch.nn.Module, expect_dce_changes: bool, - modules_to_be_leafs: Set[Type] = None, + modules_to_be_leafs: Optional[Set[Type]] = None, custom: bool = False, ): class TestTracer(torch.fx.Tracer): diff --git a/test/inductor/test_cutlass_backend.py b/test/inductor/test_cutlass_backend.py index 0936b8af69476..617ac6f805d28 100644 --- a/test/inductor/test_cutlass_backend.py +++ b/test/inductor/test_cutlass_backend.py @@ -266,7 +266,7 @@ def _test_max_autotune_cutlass_backend_epilogue_fusion( mixed_precision=False, fp16=True, expected_fuse_count=0, - mm: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None, + mm: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, batch_size: Optional[int] = None, ): torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( diff --git a/test/quantization/pt2e/test_representation.py b/test/quantization/pt2e/test_representation.py index 90f591de27cbf..07aedfdffb9fa 100644 --- a/test/quantization/pt2e/test_representation.py +++ b/test/quantization/pt2e/test_representation.py @@ -1,6 +1,6 @@ # Owner(s): ["oncall: quantization"] import copy -from typing import Any, Dict, Tuple +from typing import Any, Dict, Optional, Tuple import torch from torch._higher_order_ops.out_dtype import out_dtype # noqa: F401 @@ -28,7 +28,7 @@ def _test_representation( quantizer: Quantizer, ref_node_occurrence: Dict[ns, int], non_ref_node_occurrence: Dict[ns, int], - fixed_output_tol: float = None, + fixed_output_tol: Optional[float] = None, output_scale_idx: int = 2, ) -> torch.nn.Module: # resetting dynamo cache diff --git a/test/test_cuda_sanitizer.py b/test/test_cuda_sanitizer.py index d4b65a1541e97..daf2cfda3dcb7 100644 --- a/test/test_cuda_sanitizer.py +++ b/test/test_cuda_sanitizer.py @@ -3,7 +3,7 @@ import sys import textwrap import traceback -from typing import List +from typing import List, Optional import torch import torch.cuda._sanitizer as csan @@ -149,8 +149,8 @@ def setUp(self): def kernel_launch( self, stream: StreamId, - read_only: List[DataPtr] = None, - read_write: List[DataPtr] = None, + read_only: Optional[List[DataPtr]] = None, + read_write: Optional[List[DataPtr]] = None, ) -> List[csan.SynchronizationError]: if read_only is None: read_only = [] @@ -168,8 +168,8 @@ def kernel_launch( def assert_good_kernel_launch( self, stream: StreamId, - read_only: List[DataPtr] = None, - read_write: List[DataPtr] = None, + read_only: Optional[List[DataPtr]] = None, + read_write: Optional[List[DataPtr]] = None, ) -> None: self.assertEqual(self.kernel_launch(stream, read_only, read_write), []) @@ -177,8 +177,8 @@ def assert_bad_kernel_launch( self, number_of_errors: int, stream: StreamId, - read_only: List[DataPtr] = None, - read_write: List[DataPtr] = None, + read_only: Optional[List[DataPtr]] = None, + read_write: Optional[List[DataPtr]] = None, ) -> None: errors = self.kernel_launch(stream, read_only, read_write) self.assertEqual(len(errors), number_of_errors) diff --git a/torch/_dynamo/variables/iter.py 
b/torch/_dynamo/variables/iter.py index aee2d89488bd7..bb9faef582ae7 100644 --- a/torch/_dynamo/variables/iter.py +++ b/torch/_dynamo/variables/iter.py @@ -275,7 +275,7 @@ class CycleIteratorVariable(IteratorVariable): def __init__( self, iterator: IteratorVariable, - saved: List[VariableTracker] = None, + saved: Optional[List[VariableTracker]] = None, saved_index: int = 0, item: Optional[VariableTracker] = None, **kwargs, diff --git a/torch/overrides.py b/torch/overrides.py index 7040ecc522539..b6e38fe966726 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -28,7 +28,7 @@ import types import warnings from functools import wraps -from typing import Any, Callable, Dict, Iterable, List, Set, Tuple, Type +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Type import torch from torch._C import ( @@ -1589,7 +1589,7 @@ def wrapped(*args, **kwargs): def _get_overloaded_args( relevant_args: Iterable[Any], - get_type_fn: Callable[[Any], Type] = None, + get_type_fn: Optional[Callable[[Any], Type]] = None, ) -> List[Any]: """Returns a list of arguments on which to call __torch_function__. diff --git a/torch/testing/_internal/optests/generate_tests.py b/torch/testing/_internal/optests/generate_tests.py index 183968f697ab9..7820fed19ccc3 100644 --- a/torch/testing/_internal/optests/generate_tests.py +++ b/torch/testing/_internal/optests/generate_tests.py @@ -157,7 +157,7 @@ def generate_opcheck_tests( testcase: Any, namespaces: List[str], failures_dict_path: Optional[str] = None, - additional_decorators: Dict[str, Callable] = None, + additional_decorators: Optional[Dict[str, Callable]] = None, test_utils: List[str] = DEFAULT_TEST_UTILS, ) -> None: """Given an existing TestCase, use the existing tests to generate From d969b34377530cf600521c118c8093b3727d8536 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sun, 27 Oct 2024 15:36:46 +0000 Subject: [PATCH 137/161] Revert "In Inductor, be willing to generate deferred runtime asserts when unbacked (#138804)" This reverts commit f1a677cba5ef7514f2cf303753d3117528867a33. Reverted https://github.com/pytorch/pytorch/pull/138804 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but it seems to fail pr_time_benchmarks job in trunk ([comment](https://github.com/pytorch/pytorch/pull/138804#issuecomment-2440069407)) --- test/inductor/test_aot_inductor.py | 90 ------------------------ torch/_inductor/sizevars.py | 27 ++----- torch/fx/experimental/symbolic_shapes.py | 7 +- 3 files changed, 9 insertions(+), 115 deletions(-) diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py index 3f346a2098140..14c48a4e7e94e 100644 --- a/test/inductor/test_aot_inductor.py +++ b/test/inductor/test_aot_inductor.py @@ -15,7 +15,6 @@ import torch._inductor import torch._inductor.config import torch.nn as nn -import torch.nn.functional as F from torch._dynamo import config as dynamo_config from torch._dynamo.testing import rand_strided, same from torch._dynamo.utils import counters @@ -3649,95 +3648,6 @@ def forward(self, x): example_inputs = (torch.randn(8, device=self.device),) self.check_model(Model(), example_inputs) - def test_tile_positional_embedding(self): - class TilePositionalEmbedding(nn.Module): - """ - Positional embedding for tiles, different for every tile, same for every token within a tile. - Notice that tile is different from patch (token). For details, please check the documentation of - :class:`torchtune.modules.vision_transformer.VisionTransformer`. 
- Args: - max_num_tiles (int): The maximum number of tiles an image can be divided into. - embed_dim (int): The dimensionality of each tile embedding. - """ - - def __init__( - self, - max_num_tiles: int, - embed_dim: int, - ): - super().__init__() - self.max_num_tiles = max_num_tiles - self.embed_dim = embed_dim - - scale = embed_dim**-0.5 - self.embedding = nn.Parameter( - scale * torch.randn(max_num_tiles, max_num_tiles, 1, embed_dim) - ) - self.gate = nn.Parameter(torch.zeros(1)) - - def forward( - self, x: torch.Tensor, aspect_ratio: torch.Tensor - ) -> torch.Tensor: - """ - args: - x (torch.Tensor): torch.Tensor with shape (bsz * n_imgs, n_tiles, n_tokens, embed_dim). - aspect_ratio (torch.Tensor): torch.Tensor with shape (bsz * n_imgs, 2), - representing the aspect ratio of the image before tile-cropping, e.g. (2,1). - returns: - torch.Tensor: The input tensor with added positional embeddings. - """ - bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape - torch._check(n_tiles <= self.max_num_tiles) - - for batch_idx, (n_tiles_h, n_tiles_w) in enumerate(aspect_ratio): - # When we batch images, all are padded to the same amount of tiles. - # The aspect_ratio lets us know the non padded tiles for each image. - # We only add positional encoding to those. - n_tiles_h = n_tiles_h.item() - n_tiles_w = n_tiles_w.item() - - n_non_padded_tiles = int(n_tiles_h * n_tiles_w) - - # We get only the positional encoding for non padded tiles, - # i.e. n_tiles_h, n_tiles_w. - torch._check_is_size(n_tiles_h) - torch._check_is_size(n_tiles_w) - torch._check(n_tiles_h > 0) - torch._check(n_tiles_w > 0) - torch._check(n_tiles_h <= self.max_num_tiles) - torch._check(n_tiles_w <= self.max_num_tiles) - padded_embedding = F.pad(self.embedding, (0, 0, 0, 0, 0, 1, 0, 1)) - # pos_embed = padded_embedding[:n_tiles_h, :n_tiles_w, :, :] - pos_embed = padded_embedding.narrow(0, 0, n_tiles_h).narrow( - 1, 0, n_tiles_w - ) - - # Add pos encoding to the non padded tiles. - pos_embed = pos_embed.clone() - pos_embed = pos_embed.view(n_non_padded_tiles, 1, self.embed_dim) - - x = F.pad(x, (0, 0, 0, 0, 0, 1, 0, 0)) - torch._check_is_size(n_non_padded_tiles) - torch._check(n_non_padded_tiles < x.size(1)) - # x[batch_idx, :n_non_padded_tiles, :, :] += pos_embed - updating = x.narrow(0, batch_idx, batch_idx + 1).narrow( - 1, 0, n_non_padded_tiles - ) - # updating += pos_embed * self.gate.tanh() - updating.add_(pos_embed * self.gate.tanh()) - # x = x[:, :n_tiles, :, :] - x = x.narrow(1, 0, n_tiles) - - return x - - x = torch.ones(1, 4, 1600, 1280, device=self.device) - aspect_ratio = torch.tensor([[2, 2]], device=self.device) - - self.check_model( - TilePositionalEmbedding(4, 1280), - (x, aspect_ratio), - ) - @dynamo_config.patch({"capture_scalar_outputs": True}) def test_sym_i64_input_codegen(self): if self.device != "cuda": diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index 8775036cf1059..44fe34895a8cd 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -247,11 +247,9 @@ def _simplify_loops_impl( # for which "strides" don't make sense so we ignore them here. # NOTE: These expressions may still block merging dims in the sound # substitution test performed in can_merge_dims. 
- ( - self.stride_vars(x, index_vars) - if isinstance(x, sympy.Expr) - else [0] * len(index_vars) - ) + self.stride_vars(x, index_vars) + if isinstance(x, sympy.Expr) + else [0] * len(index_vars) for x in index_formulas ] assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0])) @@ -417,29 +415,14 @@ def guard_equals(self, left: Expr, right: Expr) -> Expr: left = sympy_subs(left, self.inv_precomputed_replacements) # type: ignore[arg-type] if isinstance(right, Expr): right = sympy_subs(right, self.inv_precomputed_replacements) # type: ignore[arg-type] - - expr = sympy.Eq(left, right) - static_expr = self.shape_env._maybe_evaluate_static(expr) - - if static_expr is not None: - assert bool(static_expr) - return left - - assert self.shape_env.defer_runtime_assert(expr, "guard_equals") + assert self.shape_env.evaluate_expr(sympy.Eq(left, right)) return left def guard_leq(self, left: Expr, right: Expr) -> None: return self.guard_lt(left, right + 1) def guard_lt(self, left: Expr, right: Expr) -> None: - expr = sympy.Lt(left, right) - static_expr = self.shape_env._maybe_evaluate_static(expr) - - if static_expr is not None: - assert bool(static_expr) - return - - assert self.shape_env.defer_runtime_assert(expr, "guard_lt") + assert self.shape_env.evaluate_expr(sympy.Lt(left, right)) def guarded_order(self, seq): """ diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index 83c651e29c585..d5503ba25acb3 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -6289,7 +6289,6 @@ def cleanup(self) -> None: for ra in ras: ra.stack.cleanup() - @lru_cache(256) @record_shapeenv_event(save_tracked_fakes=True) def defer_runtime_assert( self, orig_expr: SympyBoolean, msg: str, fx_node: Optional[torch.fx.Node] = None @@ -6327,6 +6326,7 @@ def defer_runtime_assert( # NB: Don't use new_expr as expr; it could contain gunk like shape0 # which we don't want to guard on + # OK, we're definitely doing a runtime assert now if ( self._translation_validation_enabled and fx_node is not None @@ -6340,9 +6340,10 @@ def defer_runtime_assert( if not self._suppress_guards_tls(): # If you're here because of this assert, read Note [Backwards runtime asserts] # in torch/_inductor/graph.py - if self.runtime_asserts_frozen: - log.warning("runtime_asserts_frozen but then got %s", expr) + assert not self.runtime_asserts_frozen, expr + self._check_frozen(expr, sympy.true) + # eliminate symbols on equality tests / refine ranges if isinstance(expr, sympy.Rel): self._maybe_guard_rel(expr) From 144d75d9341d1391bdd9c6158c1ad94223bb4365 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Sun, 27 Oct 2024 15:39:33 +0000 Subject: [PATCH 138/161] Revert "[PGNCCL] Use non-blocking mode by default in eager init (#138527)" This reverts commit 07e30eae2a8241e531890b6c9a33ab5a80c5ccaf. 
Reverted https://github.com/pytorch/pytorch/pull/138527 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but it is failing on ROCm ([comment](https://github.com/pytorch/pytorch/pull/138527#issuecomment-2440070035)) --- test/distributed/test_c10d_nccl.py | 64 ++++++++++--------- torch/csrc/cuda/nccl.cpp | 7 +- torch/csrc/distributed/c10d/NCCLUtils.cpp | 12 +++- torch/csrc/distributed/c10d/NCCLUtils.hpp | 37 +++++------ .../distributed/c10d/ProcessGroupNCCL.cpp | 48 ++------------ .../distributed/c10d/ProcessGroupNCCL.hpp | 8 --- 6 files changed, 70 insertions(+), 106 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 64a210ed3b6c0..6d81901a7a66c 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -321,30 +321,25 @@ def abortpg(): @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - @parametrize("eager_init", [True, False]) - def test_close_pg(self, eager_init: bool): + def test_close_pg(self): # Disable ASYNC_ERROR_HANDLING for this test to ensure we can programmatically # abort the process group. os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0" store = c10d.FileStore(self.file_name, self.world_size) - device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") - c10d.init_process_group( - "nccl", - world_size=self.world_size, - rank=self.rank, - store=store, - device_id=device if eager_init else None, - ) + pg = self._create_process_group_nccl(store, self.opts()) + device = self.rank_to_GPU[self.rank][0] t = torch.rand(10, 10, device=device) # First allreduce to initialize state. - dist.all_reduce(t) + pg.allreduce(t) # Destroy pg and validate pg is no longer valid dist.destroy_process_group() - with self.assertRaises(ValueError): - dist.all_reduce(t) + with self.assertRaises(dist.DistBackendError): + pg.allreduce([t]) + + del pg CUDA_12_AND_ABOVE = torch.cuda.is_available() and ( torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12 @@ -808,24 +803,27 @@ def test_extend_nccl_pg_timeout(self, backend): @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - @parametrize("eager_init", [True, False]) - def test_new_group(self, eager_init: bool): + def test_comm_lazy_init_split(self): # Test the optimization of new groups that contain all world # ranks use the "transparent" `ncclCommSplit` optimization. store = c10d.FileStore(self.file_name, self.world_size) - device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") - c10d.init_process_group( - "nccl", - world_size=self.world_size, - rank=self.rank, - store=store, - device_id=device if eager_init else None, - ) - ng = c10d.new_group() - tensor = torch.tensor([self.rank], device=device) - dist.broadcast(tensor, 0) - dist.broadcast(tensor, 0, group=ng) - dist.destroy_process_group() + pg = self._create_process_group_nccl(store, self.opts()) + + # Test lazy splitting behavior across each per-device backend. + for device in self.rank_to_GPU[self.rank]: + backend = pg._get_backend(torch.device(device)) + + # split doesn't happen unless the original process group has lazily + # created communicators, so first verify we haven't split even when + # making the new group and running an operation on the original pg. 
+ ng = c10d.new_group() + tensor = torch.tensor([self.rank]).cuda(device) + pg.broadcast(tensor, 0) + self.assertEqual(backend.comm_split_count(), 0) + + # The new group will not force a split because it is a lazy init. + ng.broadcast(tensor, 0) + self.assertEqual(backend.comm_split_count(), 0) @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @@ -865,11 +863,15 @@ def test_comm_eager_init_subgroup(self): pg = self._create_process_group_nccl(store, self.opts()) backend = pg._get_backend(torch.device(device)) self.assertEqual(backend._is_initialized(), False) - # create a subgroup eagerly - new_group = c10d.new_group([0, 1], device_id=device) + tensor = torch.full((1,), self.rank).cuda(device) + new_group = c10d.new_group([0, 1], device_id=device) + self.assertEqual(backend.comm_split_count(), 0) + + new_backend = new_group._get_backend(torch.device(device)) + self.assertEqual(new_backend._is_initialized(), True) dist.broadcast(tensor, 0, group=new_group) - # the default group should stay lazy + self.assertEqual(new_backend.comm_split_count(), 0) self.assertEqual(backend._is_initialized(), False) torch.cuda.synchronize() dist.destroy_process_group() diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 7be7b08efc6a6..a426d9043fa66 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -159,6 +159,7 @@ static inline void NCCL_CHECK(ncclResult_t result) { } // TODO(eqy): can this duplication be avoided from NCCLUtils.cpp? +// Default value: on bool nccl_use_nonblocking() { static bool nccl_use_nonblocking_ = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; @@ -193,8 +194,7 @@ static inline void NCCL_CHECK_TIMEOUT(ncclResult status, ncclComm_t comm) { currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error( - "NCCL timeout when waiting for nonblocking call to become successful."); + throw std::runtime_error("NCCL timeout."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comm), &result); @@ -226,8 +226,7 @@ static inline void NCCL_CHECK_TIMEOUT( currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error( - "NCCL timeout when waiting for nonblocking call to become successful."); + throw std::runtime_error("NCCL timeout."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comms[i]), &result); diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index 00bd235c86666..e5fb9abacdb88 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -31,7 +31,7 @@ ncclComm_t NCCLComm::getNcclComm() { commFailureMsg)); } // In non-blocking mode, ensure comm is ready. - if (nonBlocking_) { + if (nccl_use_nonblocking()) { // If timeout is reached, throw an exception. 
C10D_NCCL_CHECK_TIMEOUT_SLEEP(ncclInProgress, ncclComm_, std::nullopt); // ncclComm_ should be initialized by now @@ -101,7 +101,6 @@ std::shared_ptr NCCLComm::split( #endif ++source->ncclCommSplitCounter_; comm->rank_ = rank; - comm->nonBlocking_ = config.blocking == 0; LOG(INFO) << "Rank " << source->rank_ << ": created child comm " << comm->repr() << " with color_id " << color_id; return comm; @@ -164,6 +163,15 @@ size_t hashTensors(const std::vector& tensors) { } #endif +bool nccl_use_nonblocking() { + static bool nccl_use_nonblocking_ = + c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; + if (nccl_use_nonblocking_) { + TORCH_WARN_ONCE("Using experimental non-blocking NCCL communicator."); + } + return nccl_use_nonblocking_; +} + // Default value: 30 minutes int nccl_nonblocking_timeout() { static int timeout = -2; // -2 means not initialized diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 0089d453bb85a..27b8b8f8e9547 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -236,6 +236,7 @@ DEFINE_CONSTANT(started_state, "started"); TORCH_API size_t hashTensors(const std::vector& tensors); TORCH_API std::string getNcclVersion(); TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error); +bool nccl_use_nonblocking(); int nccl_nonblocking_timeout(); // Provides additional detail into NCCL error codes based on when these are @@ -310,8 +311,6 @@ class NCCLComm { comm->ncclId_ = commId; comm->rank_ = rank; comm->initialized_ = true; - // Old style comm is always blocking. - comm->nonBlocking_ = false; return comm; } @@ -322,19 +321,26 @@ class NCCLComm { ncclUniqueId commId, ncclConfig_t& config) { auto comm = std::make_shared(); - comm->nonBlocking_ = config.blocking == 0; - LOG(INFO) << "Rank " << rank << ": creating NCCL communicator with mode: " - << (comm->nonBlocking_ ? "nonblocking" : "blocking"); - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - std::nullopt); + bool isInitialized = false; + if (nccl_use_nonblocking()) { + config.blocking = 0; + LOG(INFO) << "Rank " << rank + << ": creating NCCL communicator in nonblocking mode"; + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + std::nullopt); + } else { + C10D_NCCL_CHECK( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + std::nullopt); + // under blocking mode, comm is initialized after NCCL CHECK + isInitialized = true; + } comm->ncclId_ = commId; comm->rank_ = rank; - // Under blocking mode, comm is initialized immediately after NCCL init - // returns; Under nonblocking mode, we check whether comm is initialized the - // *next* time ncclComm_ is accessed. - comm->initialized_ = !comm->nonBlocking_; + comm->initialized_ = isInitialized; return comm; } @@ -379,7 +385,6 @@ class NCCLComm { std::swap(aborted_, other.aborted_); std::swap(ncclAsyncErr_, other.ncclAsyncErr_); std::swap(initialized_, other.initialized_); - std::swap(nonBlocking_, other.nonBlocking_); } ncclComm_t getNcclComm(); @@ -548,10 +553,6 @@ class NCCLComm { // better error messaging. std::optional commFailureReason_{}; bool initialized_{false}; - // Whether this communicator is using nonblocking mode. Recorded during comm - // creation or split. For safety, we give a default value of true (more - // protection). 
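For context on what this revert restores: with the eager-init heuristic removed, non-blocking NCCL communicators are again opted into solely through the TORCH_NCCL_USE_COMM_NONBLOCKING environment variable checked by nccl_use_nonblocking(). A minimal sketch of that opt-in path follows; the FileStore path, rank wiring, and tensor shape are illustrative assumptions, not part of this patch.

```py
# Hedged sketch of the env-var-driven opt-in restored by this revert.
import os
os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"  # read at communicator creation

import torch
import torch.distributed as dist

def init_and_allreduce(rank: int, world_size: int, store_path: str) -> None:
    store = dist.FileStore(store_path, world_size)  # illustrative store choice
    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    # device_id still requests eager communicator creation, but after this
    # revert it no longer implies non-blocking mode on its own.
    dist.init_process_group(
        "nccl", world_size=world_size, rank=rank, store=store, device_id=device
    )
    t = torch.rand(10, 10, device=device)
    dist.all_reduce(t)
    dist.destroy_process_group()
```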
- bool nonBlocking_{true}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL std::unordered_map registeredSegmentHandles_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index c9564a31f057c..6206b4d6c5994 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -987,6 +987,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( << ", TORCH_NCCL_ENABLE_TIMING: " << enableTiming_.load() << ", TORCH_NCCL_BLOCKING_WAIT: " << blockingWait_ << ", TORCH_DISTRIBUTED_DEBUG: " << torch_distributed_debug + << ", TORCH_NCCL_USE_COMM_NONBLOCKING: " << nccl_use_nonblocking() #ifdef NCCL_HAS_COMM_REGISTER << ", TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: " << useTensorRegisterAllocatorHook_ @@ -1058,39 +1059,6 @@ void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) { getNCCLComm(key, device, OpType::ALLREDUCE); } -bool ProcessGroupNCCL::useNonblocking() { -#ifndef NCCL_HAS_COMM_NONBLOCKING - return false; -#endif - // Already parsed, return the cached value - if (useNonblocking_.has_value()) { - return useNonblocking_.value(); - } - // Get environment variable. - auto nbEnv = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING"); - - // 1st priority: Respect the user's setting - if (options_->config.blocking != NCCL_CONFIG_UNDEF_INT) { - useNonblocking_ = options_->config.blocking == 0; - } - // 2nd priority: Respect the environment variable - else if (nbEnv.has_value()) { - useNonblocking_ = nbEnv.value(); - } - // 3rd priority: automatically use nonblocking if we are in eager init mode - else if (getBoundDeviceId()) { - useNonblocking_ = true; - } - // 4th priority: otherwise, nonblocking = false to preserve old behavior - else { - useNonblocking_ = false; - } - - LOG(INFO) << logPrefix() - << "Using non-blocking mode: " << useNonblocking_.value(); - return useNonblocking_.value(); -} - void ProcessGroupNCCL::performNocolorSplit(at::Device device) { // If our backend doesn't support splitting, this is a no-op for // ranks not in the new subgroup (and ranks that would be in it will @@ -1099,8 +1067,6 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) { const auto key = getKeyFromDevice(device); LOG(INFO) << logPrefix() << "Performing nocolor split on backend device " << device << ", key " << key << ", i am " << this; - bool useNb = useNonblocking(); - options_->config.blocking = useNb ? 0 : 1; auto comm = getNCCLComm(key, device, OpType::ALLREDUCE); NCCLComm::split( comm.get(), @@ -2391,11 +2357,6 @@ std::shared_ptr ProcessGroupNCCL::getNCCLComm( rank = p2pRank; } -#ifdef NCCL_HAS_COMM_NONBLOCKING - bool useNb = useNonblocking(); - options_->config.blocking = useNb ? 0 : 1; -#endif - #ifdef NCCL_HAS_COMM_SPLIT if (options_->split_from) { // Find a valid, healthy communicator to split from if possible. 
@@ -2812,7 +2773,7 @@ c10::intrusive_ptr ProcessGroupNCCL::endCoalescing(OpType optype) { work->ncclStartEvent_->record(ncclStream); } - if (useNonblocking()) { + if (nccl_use_nonblocking()) { groupEndNonblocking(comm); } else { groupEnd(); @@ -3132,7 +3093,8 @@ c10::intrusive_ptr ProcessGroupNCCL::collectiveCoalesced( #endif { - torch::cuda::nccl::AutoNcclGroup nccl_group_guard(comm, useNonblocking()); + torch::cuda::nccl::AutoNcclGroup nccl_group_guard( + comm, nccl_use_nonblocking()); for (const auto i : c10::irange(inputs.size())) { // Both `inputs' and `outputs' are created on a worker stream and used in // different ncclStreams. Hence, both must record the ncclStream to @@ -4700,7 +4662,7 @@ void ProcessGroupNCCL::groupEndNonblocking( #ifndef NCCL_HAS_COMM_NONBLOCKING C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); #else - if (!useNonblocking()) { + if (!nccl_use_nonblocking()) { C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); } else { C10D_NCCL_CHECK_TIMEOUT_GROUPEND(ncclGroupEnd(), comm, std::nullopt); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 839463a9d8be1..5ec9ae32405f6 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -778,10 +778,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Abort all communicators on this rank. bool abortComms(const std::optional& abortReason = std::nullopt); - // A helper function to check if nonblocking API mode should be used. - // Use this helper instead of directly checking `useNonblocking_` variable. - bool useNonblocking(); - private: int globalRankStart; int globalRankStride; @@ -1241,10 +1237,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::shared_ptr pgStatus_ = std::make_shared(); - - // Internal cached value: use NCCL non-blocking API mode or not. - // Use `useNonblocking()` method instead of accessing this variable directly. 
- std::optional useNonblocking_{std::nullopt}; }; // Dumps the NCCL comm traces and additional information about the Process From bae3426af77be643af83f1527fb430e9ca09b058 Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Sun, 27 Oct 2024 16:31:34 +0000 Subject: [PATCH 139/161] reimport pr137735 due to merging check issues (#138959) This is a cherry-pick from #137735 by @mikaylagawarecki , that cannot be merged due to a (wrongly) failing check for codev @diff-train-skip-merge Pull Request resolved: https://github.com/pytorch/pytorch/pull/138959 Approved by: https://github.com/mikaylagawarecki --- build.bzl | 2 +- caffe2/serialize/inline_container.cc | 15 +++++--- caffe2/serialize/inline_container.h | 5 +-- docs/source/notes/serialization.rst | 2 ++ test/test_serialization.py | 29 +++++++++++++++ third_party/miniz-2.1.0/miniz.c | 3 +- third_party/miniz-2.1.0/miniz.h | 3 +- torch/_C/__init__.pyi.in | 4 +-- torch/csrc/jit/python/init.cpp | 54 ++++++++++++++++------------ torch/serialization.py | 38 ++++++++++++++++++-- 10 files changed, 119 insertions(+), 36 deletions(-) diff --git a/build.bzl b/build.bzl index dbb1866ac5482..3ba83e4578caa 100644 --- a/build.bzl +++ b/build.bzl @@ -36,7 +36,7 @@ def define_targets(rules): "caffe2/serialize/istream_adapter.cc", "caffe2/serialize/read_adapter_interface.cc", ], - copts = ["-fexceptions"], + copts = ["-fexceptions", "-DFBCODE_CAFFE2"], tags = [ "-fbcode", "supermodule:android/default/pytorch", diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 2761147cf333d..70c13791da688 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -621,15 +621,17 @@ size_t ostream_write_func( return ret; } -PyTorchStreamWriter::PyTorchStreamWriter(const std::string& file_name) - : archive_name_(basename(file_name)) { +PyTorchStreamWriter::PyTorchStreamWriter(const std::string& file_name, bool compute_crc32) + : archive_name_(basename(file_name)), + compute_crc32_(compute_crc32) { setup(file_name); } PyTorchStreamWriter::PyTorchStreamWriter( - const std::function writer_func) + const std::function writer_func, bool compute_crc32) : archive_name_("archive"), - writer_func_(writer_func) { + writer_func_(writer_func), + compute_crc32_(compute_crc32) { setup(archive_name_); } @@ -695,6 +697,11 @@ void PyTorchStreamWriter::writeRecord( size_t padding_size = detail::getPadding(ar_->m_archive_size, full_name.size(), size, padding_); uint32_t flags = compress ? 
MZ_BEST_COMPRESSION : 0; + if (!compute_crc32_) { +#if (!defined(FBCODE_CAFFE2)) + flags |= MZ_ZIP_FLAG_DO_NOT_COMPUTE_CRC32; +#endif + } mz_zip_writer_add_mem_ex_v2( /*pZip=*/ar_.get(), /*pArchive_name=*/full_name.c_str(), diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index 6a13d414feb9e..55a723f3b8912 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -205,9 +205,9 @@ class TORCH_API PyTorchStreamReader final { class TORCH_API PyTorchStreamWriter final { public: - explicit PyTorchStreamWriter(const std::string& archive_name); + explicit PyTorchStreamWriter(const std::string& archive_name, bool compute_crc32 = true); explicit PyTorchStreamWriter( - const std::function writer_func); + const std::function writer_func, bool compute_crc32 = true); void setMinVersion(const uint64_t version); @@ -248,6 +248,7 @@ class TORCH_API PyTorchStreamWriter final { std::function writer_func_; uint64_t combined_uncomp_crc32_ = 0; std::string serialization_id_; + bool compute_crc32_; // This number will be updated when the model has operators // that have valid upgraders. diff --git a/docs/source/notes/serialization.rst b/docs/source/notes/serialization.rst index c05dc028a471c..255fa2dbaa577 100644 --- a/docs/source/notes/serialization.rst +++ b/docs/source/notes/serialization.rst @@ -390,6 +390,8 @@ The following utility functions are related to serialization: .. currentmodule:: torch.serialization .. autofunction:: register_package +.. autofunction:: get_crc32_options +.. autofunction:: set_crc32_options .. autofunction:: get_default_load_endianness .. autofunction:: set_default_load_endianness .. autofunction:: get_default_mmap_options diff --git a/test/test_serialization.py b/test/test_serialization.py index a58e47c083176..59d6e21bd3a19 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -4334,6 +4334,35 @@ def test_weights_only_env_variables(self, force_weights_only): else: os.environ[env_var] = old_value + @unittest.skipIf(IS_FBCODE, "miniz version differs between fbcode and oss") + @parametrize("compute_crc32", (True, False)) + @parametrize("filename", (True, False)) + def test_crc32_options(self, compute_crc32, filename): + # test both path and buffer case + file_creation_func = TemporaryFileName if filename else tempfile.NamedTemporaryFile + sd = torch.nn.Linear(3, 5).state_dict() + with file_creation_func() as f: + try: + torch.serialization.set_crc32_options(compute_crc32) + torch.save(sd, f) + if not filename: + f.seek(0) + sd_loaded = torch.load(f, weights_only=True) + self.assertEqual(sd_loaded, sd) + finally: + torch.serialization.set_crc32_options(True) + + args = () if compute_crc32 else (zipfile.BadZipFile, "Bad CRC-32 for file") + ctx = contextlib.nullcontext if compute_crc32 else self.assertRaisesRegex + + if not filename: + f.seek(0) + # zip_file.extractall() will raise BadZipFile if CRC32 is not populated + # we use the context manager to check whether CRC32 was populated + with ctx(*args), tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(f) as zip_file: + zip_file.extractall(path=temp_dir) + def run(self, *args, **kwargs): with serialization_method(use_zip=True): return super().run(*args, **kwargs) diff --git a/third_party/miniz-2.1.0/miniz.c b/third_party/miniz-2.1.0/miniz.c index dc790d9e36b7c..043a11b1d45f1 100755 --- a/third_party/miniz-2.1.0/miniz.c +++ b/third_party/miniz-2.1.0/miniz.c @@ -6251,6 +6251,7 @@ mz_bool 
mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n mz_uint8 extra_data[MZ_ZIP64_MAX_CENTRAL_EXTRA_FIELD_SIZE]; mz_uint16 bit_flags = 0; mz_bool write_metadata_only = buf_size && !pBuf; + mz_bool skip_crc32 = write_metadata_only || (level_and_flags & MZ_ZIP_FLAG_DO_NOT_COMPUTE_CRC32); if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL; @@ -6309,7 +6310,7 @@ mz_bool mz_zip_writer_add_mem_ex_v2(mz_zip_archive *pZip, const char *pArchive_n if (!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) { - if (!write_metadata_only) { + if (!skip_crc32) { uncomp_crc32 = (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size); } uncomp_size = buf_size; diff --git a/third_party/miniz-2.1.0/miniz.h b/third_party/miniz-2.1.0/miniz.h index 2cad1370c6388..0d5e73071f82a 100755 --- a/third_party/miniz-2.1.0/miniz.h +++ b/third_party/miniz-2.1.0/miniz.h @@ -1001,7 +1001,8 @@ typedef enum { MZ_ZIP_FLAG_VALIDATE_HEADERS_ONLY = 0x2000, /* validate the local headers, but don't decompress the entire file and check the crc32 */ MZ_ZIP_FLAG_WRITE_ZIP64 = 0x4000, /* always use the zip64 file format, instead of the original zip file format with automatic switch to zip64. Use as flags parameter with mz_zip_writer_init*_v2 */ MZ_ZIP_FLAG_WRITE_ALLOW_READING = 0x8000, - MZ_ZIP_FLAG_ASCII_FILENAME = 0x10000 + MZ_ZIP_FLAG_ASCII_FILENAME = 0x10000, + MZ_ZIP_FLAG_DO_NOT_COMPUTE_CRC32 = 0x20000, /* don't compute the crc32 of file data that's being added. */ } mz_zip_flags; typedef enum { diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 930e4be2420e1..f54f2617f5488 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1447,9 +1447,9 @@ class PyTorchFileReader: class PyTorchFileWriter: @overload - def __init__(self, name: str) -> None: ... + def __init__(self, name: str, compute_crc32 = True) -> None: ... @overload - def __init__(self, buffer: BinaryIO) -> None: ... + def __init__(self, buffer: BinaryIO, compute_crc32 = True) -> None: ... def write_record(self, name: str, data: Union[Storage, bytes, _int], size: _int) -> None: ... def write_end_of_file(self) -> None: ... def set_min_version(self, version: _int) -> None: ... 
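The Python-facing surface of this change is the get_crc32_options/set_crc32_options pair documented in serialization.rst above. A usage sketch mirroring the new test_crc32_options test follows; the checkpoint path is an illustrative assumption.

```py
# Hedged usage sketch for the new CRC32 toggle (mirrors test_crc32_options).
import torch

sd = torch.nn.Linear(3, 5).state_dict()
try:
    # Skip per-record CRC32 computation during torch.save. Per the docs note
    # above, external zip tools may then report a bad CRC, but torch.load can
    # still read the file.
    torch.serialization.set_crc32_options(False)
    torch.save(sd, "checkpoint.pt")  # illustrative path
finally:
    # Restore the default so later saves keep writing CRC32 records.
    torch.serialization.set_crc32_options(True)

assert torch.serialization.get_crc32_options()
loaded = torch.load("checkpoint.pt", weights_only=True)
```

The trade-off is the one called out in the new documentation: skipping CRC32 avoids the per-record checksum work at the cost of zip-level integrity checks, while torch.load itself is unaffected.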
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 4ac84dedb544c..588c13c21bb58 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1389,28 +1389,38 @@ void initJITBindings(PyObject* module) { "fallback", [](GraphExecutorState& s) { return s.fallback; }); py::class_(m, "PyTorchFileWriter") - .def(py::init()) - .def(py::init([](const py::object& buffer) { - auto writer_func = [=](const void* data, size_t size) { - // Writing an empty file is a noop - if (size == 0) { - return size; - } - py::gil_scoped_acquire acquire; - if (!data) { - // See [Note: write_record_metadata] - buffer.attr("seek")( - size, py::module::import("os").attr("SEEK_CUR")); - } else { - auto memory_view = py::memoryview::from_memory( - reinterpret_cast(data), size); - buffer.attr("write")(std::move(memory_view)); - } - return size; - }; - return std::make_unique(std::move(writer_func)); - })) - .def(py::init&>()) + .def( + py::init(), + py::arg("file_name"), + py::arg("compute_crc32") = true) + .def( + py::init([](const py::object& buffer, bool compute_crc32 = true) { + auto writer_func = [=](const void* data, size_t size) { + // Writing an empty file is a noop + if (size == 0) { + return size; + } + py::gil_scoped_acquire acquire; + if (!data) { + // See [Note: write_record_metadata] + buffer.attr("seek")( + size, py::module::import("os").attr("SEEK_CUR")); + } else { + auto memory_view = py::memoryview::from_memory( + reinterpret_cast(data), size); + buffer.attr("write")(std::move(memory_view)); + } + return size; + }; + return std::make_unique( + std::move(writer_func), compute_crc32); + }), + py::arg("buffer"), + py::arg("compute_crc32") = true) + .def( + py::init&, bool>(), + py::arg("writer_func"), + py::arg("compute_crc32") = true) // [Note: write_record_metadata] // The write_record_metadata function is intended to write metadata (i.e. // the zipfile header and end of central directory record) for a file diff --git a/torch/serialization.py b/torch/serialization.py index 17517db6e7fd1..a87230e824aab 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -53,6 +53,8 @@ "load", "StorageType", "LoadEndianness", + "get_crc32_options", + "set_crc32_options", "get_default_load_endianness", "set_default_load_endianness", "get_default_mmap_options", @@ -167,6 +169,34 @@ def set_default_load_endianness(endianness): _default_load_endian = endianness +_compute_crc32: bool = True + + +def get_crc32_options() -> bool: + """ + Get whether :func:`torch.save` computes and writes crc32 for each record. + + Defaults to ``True``. + """ + return _compute_crc32 + + +def set_crc32_options(compute_crc32: bool): + """ + Set whether :func:`torch.save` computes and writes crc32 for each record. + + .. note:: + Setting this to ``False`` may make unzipping of the ``torch.save`` output + fail or warn due to corrupted CRC32. However ``torch.load`` will be + able to load the file. + + Args: + compute_crc32 (bool): set crc32 compuation flag + """ + global _compute_crc32 + _compute_crc32 = compute_crc32 + + _default_mmap_options: int = MAP_PRIVATE @@ -682,9 +712,11 @@ def __init__(self, name) -> None: # For filenames with non-ascii characters, we rely on Python # for writing out the file. 
self.file_stream = io.FileIO(self.name, mode="w") - super().__init__(torch._C.PyTorchFileWriter(self.file_stream)) + super().__init__( + torch._C.PyTorchFileWriter(self.file_stream, _compute_crc32) + ) else: - super().__init__(torch._C.PyTorchFileWriter(self.name)) + super().__init__(torch._C.PyTorchFileWriter(self.name, _compute_crc32)) def __exit__(self, *args) -> None: self.file_like.write_end_of_file() @@ -700,7 +732,7 @@ def __init__(self, buffer) -> None: raise AttributeError(msg) raise TypeError(msg) self.buffer = buffer - super().__init__(torch._C.PyTorchFileWriter(buffer)) + super().__init__(torch._C.PyTorchFileWriter(buffer, _compute_crc32)) def __exit__(self, *args) -> None: self.file_like.write_end_of_file() From 3217ae20827202c73feddaf8b5e7f7688a602a9a Mon Sep 17 00:00:00 2001 From: Jason Ansel Date: Sat, 26 Oct 2024 14:08:56 -0700 Subject: [PATCH 140/161] [inductor] Only apply score_fusion_memory_threshold to horizontal fusions (#138970) PR #136782 made `x.sum()+1` become two kernels, which hurts compile times as @ezyang noticed and breaks a lot of the tests in this stack. This reworks that heuristic to not apply as often. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138970 Approved by: https://github.com/shunting314 --- torch/_inductor/scheduler.py | 41 ++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index ce70857b9d13c..3c4ebb0af5089 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -2854,9 +2854,9 @@ def decide_fusion_fail_reason( return str(reasons) - def has_shared_data_after_reordering_loop( + def shared_data_after_reordering_loop( self, node1: BaseSchedulerNode, node2: BaseSchedulerNode - ) -> bool: + ) -> int: """ Right now just greedily reorder the loop of node1 to be compatible with node2, but ideally we should have some heuristics to reorder the loop for node2 @@ -2868,14 +2868,14 @@ def has_shared_data_after_reordering_loop( if not config.loop_ordering_after_fusion or any( n.get_device().type == "cpu" for n in [node1, node2] ): - return False + return 0 node1_buffer_names = node1.read_writes.buffer_names() node2_buffer_names = node2.read_writes.buffer_names() # Fast path: no common buffers. 
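To make the regression described in the commit message concrete, a hedged repro sketch for the `x.sum() + 1` case is below. The tensor shape and CUDA device are illustrative assumptions, and the expected single-kernel outcome reflects the stated intent of this change rather than anything asserted by the patch itself.

```py
# Hedged repro sketch: count generated kernels for x.sum() + 1.
import torch
import torch._inductor.metrics as metrics

@torch.compile(fullgraph=True)
def f(x):
    return x.sum() + 1

metrics.generated_kernel_count = 0
f(torch.randn(4096, device="cuda"))
# With the memory threshold applied only to horizontal fusions, the pointwise
# add is expected to fuse back into the reduction kernel (one kernel total).
print(metrics.generated_kernel_count)
```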
common_buffer_names = node1_buffer_names & node2_buffer_names if not common_buffer_names: - return False + return 0 node1_name2dep = {dep.name: dep for dep in node1.read_writes.reads_and_writes()} node2_name2dep = {dep.name: dep for dep in node2.read_writes.reads_and_writes()} @@ -2898,7 +2898,7 @@ def has_shared_data_after_reordering_loop( ) if len(candidates) == 0: - return False + return 0 # Pick the largest buffer to guide the loop reordering numel, lhs_dep, rhs_dep = max(candidates, key=lambda x: x[0]) @@ -2908,7 +2908,9 @@ def has_shared_data_after_reordering_loop( # We can not do loop reordering in this case right now # Simply returning true if the two Deps are the same after # normalization (merging loops) - return lhs_dep.normalize() == rhs_dep.normalize() + if lhs_dep.normalize() == rhs_dep.normalize(): + return self.dep_size_hint(lhs_dep) + return 0 # Only reorder loops for pointwise for now if not node1.is_reduction(): @@ -2922,10 +2924,7 @@ def has_shared_data_after_reordering_loop( node2.get_name(), ) - return ( - self.score_fusion_memory(node1, node2) - >= config.score_fusion_memory_threshold - ) + return self.score_fusion_memory(node1, node2) def unfusable_node(self, node: BaseSchedulerNode) -> bool: """ @@ -2993,22 +2992,17 @@ def can_fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool: return False del device2 - no_shared_data = ( - self.score_fusion_memory(node1, node2) - < config.score_fusion_memory_threshold - ) - if no_shared_data: - no_shared_data = not self.has_shared_data_after_reordering_loop( - node1, node2 - ) + shared_data_score = self.score_fusion_memory(node1, node2) + if shared_data_score == 0: + shared_data_score = self.shared_data_after_reordering_loop(node1, node2) loop_ordering_log.debug( "%s and %s has%s shared data", node1.get_name(), node2.get_name(), - " no" if no_shared_data else "", + " no" if shared_data_score == 0 else "", ) - if no_shared_data and ( + if shared_data_score == 0 and ( not config.aggressive_fusion or node1.is_reduction() or node2.is_reduction() ): if is_metric_table_enabled("fusion_failure_due_to_indexing_mismatch"): @@ -3050,6 +3044,13 @@ def can_fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool: return False return self.get_backend(device).can_fuse_vertical(node1, node2) else: # nodes don't depend on each other, but may have common reads + if ( + # only apply score_fusion_memory_threshold to horizontal fusions + shared_data_score + < config.score_fusion_memory_threshold + ): + why("score_fusion_memory_threshold") + return False if self.can_fusion_increase_peak_memory(node1, node2): why("will increase peak memory") return False From fed37dbfbceefe306af648ff4fe1e0124c4d7844 Mon Sep 17 00:00:00 2001 From: Jason Ansel Date: Sat, 26 Oct 2024 14:08:56 -0700 Subject: [PATCH 141/161] [inductor] Cooperative reductions (#137756) Example generated code for `(x+y).sum()`: ```py @triton.jit def triton_unk_fused_add_sum_0(in_ptr0, in_ptr1, out_ptr0, ws_ptr, semaphores_ptr, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr, RSPLIT : tl.constexpr): xnumel = 1 rnumel = 1048576 rsplit_id = tl.program_id(0) num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK rsplit_start = rsplit_chunk * rsplit_id rsplit_end = rsplit_chunk * (rsplit_id + 1) xoffset = tl.program_id(1) * XBLOCK xindex = xoffset + tl.arange(0, XBLOCK)[:, None] xmask = tl.full([XBLOCK, RBLOCK], True, tl.int1) rbase = tl.arange(0, RBLOCK)[None, :] _tmp4 = tl.full([XBLOCK, RBLOCK], 
0, tl.float32) for roffset in range(rsplit_start, rsplit_end, RBLOCK): rindex = roffset + rbase rmask = rindex < rnumel r0 = rindex tmp0 = tl.load(in_ptr0 + (r0), rmask, eviction_policy='evict_first', other=0.0) tmp1 = tl.load(in_ptr1 + (r0), rmask, eviction_policy='evict_first', other=0.0) tmp2 = tmp0 + tmp1 tmp3 = tl.broadcast_to(tmp2, [XBLOCK, RBLOCK]) tmp5 = _tmp4 + tmp3 _tmp4 = tl.where(rmask, tmp5, _tmp4) tmp4 = tl.sum(_tmp4, 1)[:, None] if RSPLIT > 1: tmp4_ws = (ws_ptr + 0).to(tl.pointer_type(tl.float32)) tl.store(tmp4_ws + (xindex * RSPLIT + rsplit_id), tmp4, None) if RSPLIT > 1: triton_helpers.gpu_barrier(semaphores_ptr + (2 * tl.program_id(1) + 0), RSPLIT, True) if RSPLIT > 1: tmp4_peers = tl.load(tmp4_ws + (xindex * RSPLIT + tl.arange(0, RSPLIT)[None,:]), None, eviction_policy='evict_first') tmp4 = tl.sum(tmp4_peers, 1)[:, None] if rsplit_id == (0 % RSPLIT): tl.store(out_ptr0 + (tl.full([XBLOCK, 1], 0, tl.int32)), tmp4, None) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/137756 Approved by: https://github.com/eellison ghstack dependencies: #138970 --- test/inductor/test_cooperative_reductions.py | 116 ++++++ test/inductor/test_perf.py | 3 + test/inductor/test_torchinductor.py | 44 ++- .../test_torchinductor_strided_blocks.py | 5 + torch/_inductor/codegen/common.py | 2 +- torch/_inductor/codegen/halide.py | 2 +- torch/_inductor/codegen/simd.py | 19 +- torch/_inductor/codegen/triton.py | 350 ++++++++++++++++-- torch/_inductor/codegen/triton_split_scan.py | 3 + torch/_inductor/codegen/wrapper.py | 14 +- torch/_inductor/config.py | 16 +- torch/_inductor/graph.py | 3 +- torch/_inductor/ir.py | 14 +- torch/_inductor/runtime/hints.py | 1 + torch/_inductor/runtime/triton_helpers.py | 33 ++ torch/_inductor/runtime/triton_heuristics.py | 71 +++- 16 files changed, 620 insertions(+), 76 deletions(-) create mode 100644 test/inductor/test_cooperative_reductions.py diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py new file mode 100644 index 0000000000000..204a653ed56b7 --- /dev/null +++ b/test/inductor/test_cooperative_reductions.py @@ -0,0 +1,116 @@ +# Owner(s): ["module: inductor"] +import torch +import torch._inductor +from torch._inductor import config +from torch._inductor.test_case import TestCase +from torch._inductor.utils import run_and_get_code +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, +) +from torch.testing._internal.inductor_utils import HAS_CUDA + + +@config.patch( + { + "triton.cooperative_reductions": True, + "triton.force_cooperative_reductions": True, + } +) +@instantiate_parametrized_tests +class CooperativeReductionTests(TestCase): + def setUp(self): + super().setUp() + torch._inductor.metrics.generated_kernel_count = 0 + torch._dynamo.reset() + + def run_and_check(self, fn, args, *, expect_kernel_count=1): + expected = fn(*args) + fn = torch.compile(fn, fullgraph=True) + result, (source_code,) = run_and_get_code(fn, *args) + self.assertEqual(result, expected) + self.assertIn("@triton_heuristics.cooperative_reduction", source_code) + self.assertEqual( + torch._inductor.metrics.generated_kernel_count, expect_kernel_count + ) + return source_code + + @parametrize( + "name", + [ + "sum", + "mean", + "prod", + "amin", + "amax", + "min", + "max", + "var_mean", + "std", + "softmax", + ], + ) + @parametrize("dtype", [torch.float16, torch.float32, torch.float64]) + def test_reduction_fns(self, name, dtype): + def fn(x, y): + return reduction_fn(x + 
y, dim=-1) + + reduction_fn = getattr(torch, name) + args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)] + self.run_and_check(fn, args) + + def test_bool_reduction_fns(self): + def fn(x, y): + return [ + torch.any(x == y), + torch.all(x == y), + torch.any(x != y), + torch.all(x != y), + torch.any(x < y), + torch.all(x > y), + ] + + args = [torch.randn(1024, device="cuda") for _ in range(2)] + source_code = self.run_and_check(fn, args) + before, after = source_code.split("triton_helpers.x_grid_barrier") + self.assertEqual(before.count("if rsplit_id == ("), 0) + self.assertEqual(after.count("if rsplit_id == ("), 6) + + @parametrize("bs", [1, 2, 5, 15]) + @parametrize("count", [1024**2 + 1, 1024**2 - 1, 1024]) + def test_non_power_of_2(self, bs, count): + def fn(x): + return x.mean(), x.std() + x.min() + + args = [torch.randn([bs, count], device="cuda")] + self.run_and_check(fn, args) + + def test_chained_reductions(self): + def fn(x): + for _ in range(8): + x = x + torch.softmax(x, 1) + return x + + args = [torch.randn(4, 100000, device="cuda")] + source_code = self.run_and_check(fn, args) + self.assertEqual(source_code.count("triton_helpers.x_grid_barrier"), 16) + self.assertEqual(source_code.count("empty_strided_cuda"), 8) + + def test_reduce_split(self): + def fn(a, b): + a1 = torch.linalg.vector_norm(a) + b1 = torch.sum(b, dim=0) + return a1, b1 + + inps = [ + torch.rand(2048, 512, device="cuda"), + torch.rand(20, 20, device="cuda"), + ] + self.run_and_check(fn, inps, expect_kernel_count=2) + + +if __name__ == "__main__": + from torch._dynamo.test_case import run_tests + + if HAS_CUDA: + run_tests(needs="filelock") diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index 87d8e383bd58a..7d9ec01e7a3d0 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -501,6 +501,9 @@ def f(x, scale, amax_keep_dim): expected_numel = ( 1 + hidden_size * 2 + 4 * 2048 * hidden_size * 2 + 4 * 2048 * 2 + 1 ) + if config.triton.cooperative_reductions: + expected_numel = 134225922 + self.assertExpectedInline(count_numel(f, *inp, True), str(expected_numel)) self.assertExpectedInline(count_numel(f, *inp, False), str(expected_numel)) diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index b35ca9a645fa0..001db08500bec 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -11747,30 +11747,34 @@ def fn(a: torch.Tensor) -> torch.Tensor: return torch.sum(a) kernels = self.get_kernels(fn, [torch.randn([256, 256], device=GPU_TYPE)]) + expected_divisible = { + # kernel0 reduces from 256 to (xnumel=8, rnumel=8192), which means it reduces 256 by 256 into an array of + # size 8 by accumulating 8192 elements at once note that rnumel is equal to 512 * 16, so rnumel which is + # at slot 3 should be in the divisible by 16 descriptor + 0: (0, 1, 3), + # kernel1 reduces from 8 elements to a single scalar. + # Since multi-kernel generate 2 variants for each kernel. The second + # persistent-reduction has index 2. 
+ 1: (0, 1), + } if config.triton.multi_kernel: - self.assertTrue( - len(kernels) == 4, - "SUM should result in four kernels when multi-kernel is enabled", - ) + self.assertEqual(len(kernels), 4) + expected_divisible[2] = expected_divisible.pop(1) + elif config.triton.cooperative_reductions: + self.assertEqual(len(kernels), 1) + expected_divisible = { + # one kernel, with extra workspace/semaphore args + 0: (0, 1, 2, 3, 5), + } else: - self.assertTrue(len(kernels) == 2, "SUM should result in two kernels") + self.assertEqual(len(kernels), 2) - # kernel0 reduces from 256 to (xnumel=8, rnumel=8192), which means it reduces 256 by 256 into an array of - # size 8 by accumulating 8192 elements at once note that rnumel is equal to 512 * 16, so rnumel which is - # at slot 3 should be in the divisible by 16 descriptor - arguments_that_are_divisible_by_16_in_kernel0 = ( - kernels[0].triton_meta["configs"][0].divisible_by_16 - ) - self.assertEqual(arguments_that_are_divisible_by_16_in_kernel0, (0, 1, 3)) + for kernel_id, expected in expected_divisible.items(): + divisible_by_16 = ( + kernels[kernel_id].triton_meta["configs"][0].divisible_by_16 + ) + self.assertEqual(divisible_by_16, expected) - # kernel1 reduces from 8 elements to a single scalar. - # Since multi-kernel generate 2 variants for each kernel. The second - # persistent-reduction has index 2. - kernel1_index = 2 if config.triton.multi_kernel else 1 - arguments_that_are_divisible_by_16_in_kernel1 = ( - kernels[kernel1_index].triton_meta["configs"][0].divisible_by_16 - ) - self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1)) torch._dynamo.reset() @config.patch(assume_aligned_inputs=False) diff --git a/test/inductor/test_torchinductor_strided_blocks.py b/test/inductor/test_torchinductor_strided_blocks.py index d9d6c7415fd29..f4cdedda1b2e2 100644 --- a/test/inductor/test_torchinductor_strided_blocks.py +++ b/test/inductor/test_torchinductor_strided_blocks.py @@ -315,6 +315,11 @@ def test_reduction( full = torch.randn(full_size).to(device) view = torch.as_strided(full, view_size, full.stride()) + if num_triton_kernels == 2 and config.triton.cooperative_reductions: + # fewer kernels with cooperative reductions + num_triton_kernels = 1 + num_block_pointers -= 2 + # Expect at least 1 block pointer for the input. # Add 2 more if we generate 2 kernels. result, (code,) = self.run_and_compare( diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 2329cc1aba9ab..572128806d342 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -1457,7 +1457,7 @@ def semaphores(self, min_size: sympy.Expr): arg = WorkspaceArg( count=min_size, zero_mode=WorkspaceZeroMode.ZERO_PER_GRAPH, - dtype=torch.int32, + dtype=torch.uint32, inner_name="sem_ptr", outer_name=f"semaphores_{current_device.type}_{current_device.index}", device=current_device, diff --git a/torch/_inductor/codegen/halide.py b/torch/_inductor/codegen/halide.py index 27a043d785443..d623f8f082dc4 100644 --- a/torch/_inductor/codegen/halide.py +++ b/torch/_inductor/codegen/halide.py @@ -1660,7 +1660,7 @@ class HalideScheduling(SIMDScheduling): int32_type = "hl.Int(32)" # TODO(jansel): Halide doesn't actually support 64 bit indexing... 
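As a standalone illustration of the feature exercised by the tests in this patch, here is a hedged sketch of enabling cooperative reductions and inspecting the generated Triton source. The config keys, run_and_get_code, and the heuristic name come from this patch's own tests; the input shape and CUDA device are arbitrary choices.

```py
# Hedged sketch: force the cooperative-reduction codegen and check the output.
import torch
from torch._inductor import config
from torch._inductor.utils import run_and_get_code

def fn(x, y):
    return (x + y).sum()

args = [torch.randn(1, 1024**2, device="cuda") for _ in range(2)]

with config.patch(
    {
        "triton.cooperative_reductions": True,
        "triton.force_cooperative_reductions": True,
    }
):
    compiled = torch.compile(fn, fullgraph=True)
    result, (source_code,) = run_and_get_code(compiled, *args)

# The split reduction is dispatched through the new heuristic decorator.
assert "@triton_heuristics.cooperative_reduction" in source_code
```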
int64_type = "hl.Int(64)" - kernel_type = HalideKernel # type: ignore[arg-type] + kernel_type = HalideKernel # type: ignore[arg-type,assignment] @classmethod def get_backend_features(cls, device: torch.device): diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py index b952a2d4529aa..01d66c2997fa5 100644 --- a/torch/_inductor/codegen/simd.py +++ b/torch/_inductor/codegen/simd.py @@ -330,6 +330,7 @@ def __init__( pid_cache=None, reduction_hint=ReductionHint.DEFAULT, override_persistent_reduction=None, + override_cooperative_reduction=None, ) -> None: if pid_cache is None: pid_cache = {} @@ -348,6 +349,11 @@ def __init__( self.index_dtype: str = index_dtype self.last_usage: OrderedSet[str] = OrderedSet() self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) + self.cooperative_reduction: bool = ( + override_cooperative_reduction + if override_cooperative_reduction is not None + else self.should_use_cooperative_reduction() + ) self.persistent_reduction: bool = ( override_persistent_reduction if override_persistent_reduction is not None @@ -421,6 +427,9 @@ def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): finally: self.inside_reduction = prior + def should_use_cooperative_reduction(self) -> bool: + return False # defined in subclass + def should_use_persistent_reduction(self) -> bool: return False # defined in subclass @@ -506,7 +515,7 @@ def set_last_usage(self, nodes): ) def disable_reduction(self): - should_flush = self.range_trees[-1].is_loop + should_flush = self.range_trees[-1].is_loop or self.cooperative_reduction @contextlib.contextmanager def ctx(): @@ -1325,6 +1334,7 @@ def get_kernel_args(self, node_schedule, numel, reduction_numel): def codegen_node_schedule( self, node_schedule, buf_accesses, numel, reduction_numel ): + from torch._inductor.codegen.triton import TritonKernel from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) @@ -1334,7 +1344,8 @@ def codegen_node_schedule( index_dtype, ) = self.get_kernel_args(node_schedule, numel, reduction_numel) - is_split_scan = any( + is_scan = schedule_contains_op(node_schedule, "scan") + is_split_scan = is_scan and any( isinstance(node, BaseSchedulerNode) and node.is_split_scan() for node in node_schedule ) @@ -1349,6 +1360,10 @@ def codegen_node_schedule( index_dtype=index_dtype, ) + if is_scan and kernel_type == TritonKernel: + # TODO(jansel): scan does not yet work with cooperative reductions + kernel_kwargs["override_cooperative_reduction"] = False + # ops.sort only works with persistent reduction, and is not bandwidth bound anyway # so taking the hit of non-coalesced loads is okay if has_sort := schedule_contains_op(node_schedule, "sort"): diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index bae35bf3c2cbe..bad4cbfa38f2f 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1,6 +1,8 @@ # mypy: allow-untyped-defs from __future__ import annotations +import collections +import contextlib import dataclasses import functools import itertools @@ -29,8 +31,15 @@ import torch._inductor.metrics as metrics import torch._logging from torch._dynamo.utils import preserve_rng_state -from torch._inductor.runtime.hints import AutotuneHint, DeviceProperties -from torch._inductor.runtime.triton_heuristics import grid as default_grid_fn +from torch._inductor.runtime.hints import ( + AutotuneHint, + 
DeviceProperties, + TRITON_MAX_RSPLIT, +) +from torch._inductor.runtime.triton_heuristics import ( + cooperative_reduction_grid, + grid as default_grid_fn, +) from torch._prims_common import is_integer_dtype from torch.utils._ordered_set import OrderedSet from torch.utils._sympy.functions import CeilDiv, FloorDiv, ModularIndexing @@ -1318,6 +1327,42 @@ def __add__(self, other: BlockParameters) -> BlockParameters: return cls(**{key: a[key] + b[key] for key in a}) +class CooperativeReductionWorkspaceCache: + """ + The scratch space used for cooperative reductions can be reused + after two reduction loops. This keeps track of what can be reused. + """ + + def __init__(self, args): + self.args = args + self.current_loop = [] + self.prior_loop = [] + self.ready_for_reuse = collections.defaultdict(collections.deque) + self.loop_count = 0 + self.store_count = 0 + + def allocate(self, nbytes: sympy.Expr): + cached = self.ready_for_reuse.get(nbytes) + if cached: + return cached.popleft() + ws_name, ws_offset = self.args.workspace(nbytes, False) + self.current_loop.append((nbytes, ws_name, ws_offset)) + return (ws_name, ws_offset) + + def on_loop_end(self): + # Buffers can be reused after 2 loop ends + for nbytes, ws_name, ws_offset in self.prior_loop: + self.ready_for_reuse[nbytes].append((ws_name, ws_offset)) + self.prior_loop = self.current_loop + self.current_loop = [] + self.loop_count += 1 + + def increment_store_count(self): + prior = self.store_count + self.store_count += 1 + return prior + + class TritonKernel(SIMDKernel): overrides = TritonKernelOverrides # type: ignore[assignment] helper_functions: HelperFunctions @@ -1333,9 +1378,13 @@ def __init__( reduction_hint=ReductionHint.DEFAULT, min_elem_per_thread=0, override_persistent_reduction=None, + override_cooperative_reduction=None, optimize_mask=True, ) -> None: self.optimize_mask: bool = optimize_mask + if pid_cache or override_persistent_reduction: + # foreach kernels don't work with cooperative reductions + override_cooperative_reduction = False super().__init__( *groups, index_dtype=index_dtype, @@ -1343,8 +1392,10 @@ def __init__( reduction_hint=reduction_hint, pid_cache=pid_cache, override_persistent_reduction=override_persistent_reduction, + override_cooperative_reduction=override_cooperative_reduction, ) - self.suffix: IndentedBuffer = IndentedBuffer() # type: ignore[assignment] + self.post_loop_combine: IndentedBuffer = IndentedBuffer() + self.post_loop_store: IndentedBuffer = IndentedBuffer() self.outside_loop_vars: OrderedSet[Any] = OrderedSet() self.min_elem_per_thread = min_elem_per_thread self.block_ptr_id = itertools.count() @@ -1354,8 +1405,64 @@ def __init__( self.autotune_hints: OrderedSet[AutotuneHint] = OrderedSet() self.triton_meta: Optional[Dict[str, object]] = None + if self.cooperative_reduction: + self.init_cooperative_reduction() + self.codegen_range_tree() + def should_use_cooperative_reduction(self) -> bool: + """Heuristic to decide self.cooperative_reduction should be used.""" + if not self.inside_reduction: + return False + if config.triton.force_cooperative_reductions: + return True + if ( + not config.triton.cooperative_reductions + or V.graph.get_current_device_or_throw().type == "cpu" + ): + return False + + xnumel, rnumel = self.numels + # TODO(jansel): base this on num_bytes_read rather than numel + xhint = V.graph.sizevars.size_hint(xnumel, fallback=2) + if xhint <= 8: + threshold = 32768 * xhint + elif xhint <= 16: + threshold = 2097152 + else: + return False + # TODO(jansel): should this default on 
for dynamic shapes? + return V.graph.sizevars.statically_known_geq(rnumel, threshold) + + def init_cooperative_reduction(self): + """One time setup code for cooperative reductions.""" + assert self.cooperative_reduction + + # shift all the grids over since tl.program_id(0) is for rsplit + for tree in self.range_trees: + if tree.grid_dim is not None: + tree.grid_dim += 1 + + xnumel, rnumel = self.numels + self.semaphores_name = self.args.semaphores(xnumel) + self.cooperative_reduction_workspace_cache = CooperativeReductionWorkspaceCache( + self.args + ) + self.body.splice( + """ + rsplit_id = tl.program_id(0) + num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK + rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK + rsplit_start = rsplit_chunk * rsplit_id + rsplit_end = rsplit_chunk * (rsplit_id + 1) + """, + strip=True, + ) + if not self._has_constant_mask(self.range_trees[-1]): + self.body.writeline( + "rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel)" + ) + def codegen_range_tree(self): for tree in self.range_trees: # reduction indexing goes inside a loop @@ -1383,7 +1490,10 @@ def should_use_persistent_reduction(self) -> bool: Heuristic to set self.persistent_reduction and add guards if needed. """ - if not (self.inside_reduction and config.triton.persistent_reductions): + if ( + not (self.inside_reduction and config.triton.persistent_reductions) + or self.cooperative_reduction + ): return False threshold = { ReductionHint.INNER: 1024, @@ -1969,6 +2079,11 @@ def store( line = f"tl.atomic_add({var} + ({indexing.index_str}), {value}, {indexing.mask_str}, sem='relaxed')" else: raise NotImplementedError(f"store mode={mode}") + + exit_stack = contextlib.ExitStack() + if not self.inside_reduction and self.cooperative_reduction: + exit_stack.enter_context(self.guard_cooperative_store(name, self.stores)) + self.stores.writeline(DeferredLine(name, line)) if advance_block_ptr: self.stores.writeline(advance_block_ptr) @@ -1976,6 +2091,17 @@ def store( if not self.inside_reduction: self.outside_loop_vars.add(value) + exit_stack.close() + + def guard_cooperative_store(self, name, buffer): + """ + For cooperative reductions only one thread block should write out the result. 
+ We rotate which thread block does each write for better parallelism + """ + idx = self.cooperative_reduction_workspace_cache.increment_store_count() + buffer.writeline(DeferredLine(name, f"if rsplit_id == ({idx} % RSPLIT):")) + return buffer.indent() + def bucketize( self, values: CSEVariable, @@ -2084,8 +2210,8 @@ def final_reduction(value): def final_argreduce(buffer, result_var, value, index): buffer.splice( f"""\ - _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) - {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + {result_var}_val, {result_var}_idx = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_idx')} """ ) @@ -2177,7 +2303,9 @@ def _mask_value(value, default): {accumulator_index} = {where_cond(f'{accumulator_index}_next', accumulator_index)} """ ) - final_argreduce(self.suffix, result_var, accumulator, accumulator_index) + final_argreduce( + self.post_loop_combine, result_var, accumulator, accumulator_index + ) elif is_welford_reduction(reduction_type): accumulator = f"{result_var}_mean" accumulator_m2 = f"{result_var}_m2" @@ -2223,17 +2351,16 @@ def _mask_value(value, default): result_mean = result_var result_m2 = self.cse.newvar(dtype=dtype) result_weight = self.cse.newvar(dtype=dtype) - self.suffix.splice( - f"""\ - {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( - {accumulator}, {accumulator_m2}, {accumulator_weight}, {dim} + result_var = self.welford_reduce_final_reduction( + self.post_loop_combine, + result_mean, + result_m2, + result_weight, + accumulator, + accumulator_m2, + accumulator_weight, + dim, ) - {result_mean} = {self.reduction_resize(f'{result_mean}_tmp')} - {result_m2} = {self.reduction_resize(f'{result_m2}_tmp')} - {result_weight} = {self.reduction_resize(f'{result_weight}_tmp')} - """ - ) - result_var = result_mean, result_m2, result_weight else: combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) updated = combine_fn(accumulator, value) @@ -2250,14 +2377,63 @@ def _mask_value(value, default): # which is needed because tl.reduce doesn't support tl.int1 accumulator = f"{accumulator}.to(tl.int8)" result_type = triton_compute_type(dtype) - self.suffix.writeline( + self.post_loop_combine.writeline( f"{result_var} = {final_reduction(accumulator)}.to({result_type})" ) else: - self.suffix.writeline( + self.post_loop_combine.writeline( f"{result_var} = {final_reduction(accumulator)}" ) + if self.cooperative_reduction: + exit_stack = contextlib.ExitStack() + for buf in (self.post_loop_combine, self.post_loop_store): + # only do cooperative reduction combines if we have more than one thread block + buf.writeline("if RSPLIT > 1:") + exit_stack.enter_context(buf.indent()) + + if reduction_type in {"argmax", "argmin"}: + self.post_loop_combine.writeline( + f"{result_var}_bval = {self.reduction_resize(f'{result_var}_val')}" + ) + peer_val = self.codegen_cooperative_reduction_peer_combine( + f"{result_var}_bval", src_dtype + ) + peer_idx = self.codegen_cooperative_reduction_peer_combine( + result_var, dtype + ) + final_argreduce(self.post_loop_store, result_var, peer_val, peer_idx) + elif is_welford_reduction(reduction_type): + assert reduction_type == "welford_reduce" + result_mean, result_m2, result_weight = result_var + peer_mean = self.codegen_cooperative_reduction_peer_combine( + result_mean, upcast_acc_dtype(src_dtype) + ) + peer_m2 = self.codegen_cooperative_reduction_peer_combine( + result_m2, 
upcast_acc_dtype(src_dtype) + ) + peer_weight = self.codegen_cooperative_reduction_peer_combine( + result_weight, upcast_acc_dtype(src_dtype) + ) + self.welford_reduce_final_reduction( + self.post_loop_store, + result_mean, + result_m2, + result_weight, + peer_mean, + peer_m2, + peer_weight, + dim, + ) + else: + peers = self.codegen_cooperative_reduction_peer_combine( + result_var, upcast_acc_dtype(src_dtype) + ) + self.post_loop_store.writeline( + f"{result_var} = {final_reduction(peers)}" + ) + exit_stack.close() + self.cse.reduction_cache[cache_key] = result_var if isinstance(result_var, tuple): @@ -2269,6 +2445,56 @@ def _mask_value(value, default): return result_var + def welford_reduce_final_reduction( + self, + buf, + result_mean, + result_m2, + result_weight, + accumulator, + accumulator_m2, + accumulator_weight, + dim, + ): + """Helper to codegen call to triton_helpers.welford""" + buf.splice( + f"""\ + {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( + {accumulator}, {accumulator_m2}, {accumulator_weight}, {dim} + ) + {result_mean} = {self.reduction_resize(f'{result_mean}_tmp')} + {result_m2} = {self.reduction_resize(f'{result_m2}_tmp')} + {result_weight} = {self.reduction_resize(f'{result_weight}_tmp')} + """ + ) + return result_mean, result_m2, result_weight + + def codegen_cooperative_reduction_peer_combine(self, result_var, dtype): + """ + Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different + column. After the barrier, every thread block loads the completed value so that it can compute the final + value independently. + """ + xnumel, rnumel = self.numels + mask = "xindex < xnumel" if xnumel != 1 and not self.no_x_dim else None + expand = "" if self.no_x_dim else "[None,:]" + + nbytes = xnumel * dtype.itemsize * TRITON_MAX_RSPLIT + ws_name, ws_offset = self.cooperative_reduction_workspace_cache.allocate(nbytes) + + self.post_loop_combine.splice( + f""" + {result_var}_ws = ({ws_name} + {self.index_to_str(ws_offset)}).to(tl.pointer_type({triton_type(dtype)})) + tl.store({result_var}_ws + (xindex * RSPLIT + rsplit_id), {result_var}, {mask}) + """, + strip=True, + ) + self.post_loop_store.writeline( + f"{result_var}_peers = tl.load({result_var}_ws + (xindex * RSPLIT + tl.arange(0, RSPLIT){expand}), " + f"{mask}, eviction_policy='evict_first')" + ) + return f"{result_var}_peers" + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): assert self.inside_reduction self.inside_reduction = False @@ -2276,8 +2502,14 @@ def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): self.inside_reduction = True var = self.args.output(name) + exit_stack = contextlib.ExitStack() + if self.cooperative_reduction: + exit_stack.enter_context( + self.guard_cooperative_store(name, self.post_loop_store) + ) + if isinstance(indexing, BlockPtrOptions): - self.suffix.writeline( + self.post_loop_store.writeline( DeferredLine( name, self.codegen_block_ptr_store_line( @@ -2291,13 +2523,15 @@ def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): ) else: assert isinstance(indexing, IndexingOptions) - self.suffix.writeline( + self.post_loop_store.writeline( DeferredLine( name, f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})", ) ) + exit_stack.close() + def _lift_helper(self, fn, num_args) -> str: # Lift IR function for scan operations into a triton function # in the global namespace @@ -2345,11 +2579,11 @@ def scan( values: 
Tuple[CSEVariable, ...], ) -> Tuple[CSEVariable, ...]: assert self.inside_reduction + assert not self.cooperative_reduction, "TODO" masks = OrderedSet(f"{tree.prefix}mask" for tree in self.range_trees) self.filter_masks(masks) masks = sorted(masks) assert not self._load_mask, "ops.scan not supported inside ops.masked" - reduction_range_prefix = self.range_trees[-1].prefix broadcasted_values = [] accumulators = [] @@ -2359,9 +2593,6 @@ def scan( dim = self.triton_tensor_ndim() - 1 for value, dtype in zip(values, dtypes): - acc_type = triton_acc_type(dtype) - cond = " & ".join(masks) - value_dtype = self.cse.generate( self.compute, f"{value}.to({triton_compute_type(dtype)})", @@ -2375,7 +2606,6 @@ def scan( broadcasted_values.append(value) acc_type = triton_acc_type(dtype) - cond = " & ".join(masks) if not self.persistent_reduction: accumulator = self.cse.newvar(dtype=dtype) @@ -2457,6 +2687,7 @@ def sort( descending: bool, ) -> Tuple[CSEVariable, ...]: assert self.inside_reduction + assert not self.cooperative_reduction, "TODO" masks = OrderedSet(f"{tree.prefix}mask" for tree in self.range_trees) self.filter_masks(masks) masks = sorted(masks) @@ -2527,12 +2758,19 @@ def codegen_body(self): or self.loads or self.stores or self.compute - or self.suffix + or self.post_loop_combine + or self.post_loop_store ): return if self.inside_reduction and self.range_trees[-1].is_loop: - self.body.writeline("for roffset in range(0, rnumel, RBLOCK):") + if self.cooperative_reduction: + self.body.writeline( + "for roffset in range(rsplit_start, rsplit_end, RBLOCK):" + ) + else: + self.body.writeline("for roffset in range(0, rnumel, RBLOCK):") + with self.body.indent(): # last range tree is always reduction self.iteration_ranges_codegen_header(self.range_trees[-1], self.body) @@ -2549,12 +2787,26 @@ def codegen_body(self): self.body.splice(self.loads) self.body.splice(self.compute) self.body.splice(self.stores) - self.body.splice(self.suffix) + self.body.splice(self.post_loop_combine) + if self.cooperative_reduction and ( + self.post_loop_combine or self.post_loop_store + ): + sem_ptr = f"{self.semaphores_name} + tl.program_id(1)" + self.body.splice( + f""" + if RSPLIT > 1: + triton_helpers.x_grid_barrier({sem_ptr}) + """, + strip=True, + ) + self.cooperative_reduction_workspace_cache.on_loop_end() + self.body.splice(self.post_loop_store) self.indexing_code.clear() self.loads.clear() self.compute.clear() self.stores.clear() - self.suffix.clear() + self.post_loop_combine.clear() + self.post_loop_store.clear() def codegen_kernel_benchmark(self, num_gb, grid=None): result = IndentedBuffer() @@ -2674,7 +2926,9 @@ def imports_for_benchmark_kernel(self): ) def _get_heuristic(self): - if self.persistent_reduction: + if self.cooperative_reduction: + return "cooperative_reduction" + elif self.persistent_reduction: assert self.inside_reduction return "persistent_reduction" elif self.inside_reduction: @@ -2828,6 +3082,8 @@ def codegen_kernel(self, name=None): "num_reduction": self.num_reduction, **self.inductor_meta_common(), } + if self.cooperative_reduction: + inductor_meta["persistent_reduction"] = self.persistent_reduction num_gb = None if config.benchmark_kernel or config.profile_bandwidth: @@ -2867,6 +3123,9 @@ def codegen_kernel(self, name=None): continue argdefs.append(f"{tree.prefix.upper()}BLOCK : tl.constexpr") + if self.cooperative_reduction: + argdefs.append("RSPLIT : tl.constexpr") + self.codegen_body() for helper in self.helper_functions: @@ -2954,15 +3213,19 @@ def KERNEL_NAME(in_ptr0, in_ptr1, 
out_ptr2, xnumel, rnumel, XBLOCK : tl.constexp if tree.prefix == "r" and self.persistent_reduction: val = self._get_persistent_RBLOCK(tree.numel) + if self.cooperative_reduction: + val = f"{val} // RSPLIT" code.writeline(f"RBLOCK: tl.constexpr = {val}") if tree.prefix == "x" and self.no_x_dim: code.writeline("XBLOCK: tl.constexpr = 1") def _get_grid_fn_str(self): - return "grid" + return self._get_grid_fn().__name__ def _get_grid_fn(self): + if self.cooperative_reduction: + return cooperative_reduction_grid return default_grid_fn def add_numel_to_call_args_and_grid(self, name, call_args, arg_types, grid): @@ -3038,8 +3301,14 @@ def iteration_ranges_ranges_code(self, entry): assert entry.tensor_dim is not None size = self.indexing_size_str(entry.tensor_dim) index_dtype = self.index_dtype - convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" - return f"tl.arange(0, {entry.prefix.upper()}BLOCK){size}{convert}" + suffix = f".to({index_dtype})" if index_dtype != "tl.int32" else "" + if ( + self.cooperative_reduction + and self.persistent_reduction + and entry.prefix == "r" + ): + suffix = f"{suffix} + rsplit_start" + return f"tl.arange(0, {entry.prefix.upper()}BLOCK){size}{suffix}" def iteration_ranges_scalar_code(self, entry, value): index_dtype = self.index_dtype @@ -3055,6 +3324,7 @@ def iteration_ranges_get_pid(self, entry): if ( entry.grid_dim == 1 and not entry.has_zdim + and not self.cooperative_reduction and not V.graph.sizevars.statically_known_leq(entry.numel, get_max_y_grid()) ): # For ynumel larger than max_ygrid, we need to use zdim. @@ -3071,6 +3341,7 @@ def _has_constant_mask(self, tree: IterationRangesRoot): return False if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] return True + # Masks are superfluous if numel is a multiple of BLOCK # (We use the fact that BLOCK is required by triton to be a power of 2) if tree.prefix == "r" and self.persistent_reduction: @@ -3082,6 +3353,9 @@ def _has_constant_mask(self, tree: IterationRangesRoot): return False max_block = TRITON_MAX_BLOCK[tree.prefix.upper()] + if tree.prefix == "r" and self.cooperative_reduction: + max_block = max_block * TRITON_MAX_RSPLIT + # Optional optimization: if block divides numel exactly, we will # never need to do a masked load to handle stragglers at the end. # It's faster to avoid masking at all. 
But it is sound to always @@ -3155,6 +3429,14 @@ def __init__(self, scheduler: Scheduler) -> None: @classmethod def get_backend_features(cls, device: torch.device): + if ( + config.triton.cooperative_reductions + or config.triton.force_cooperative_reductions + ): + return { + **cls.backend_features, + BackendFeature.REDUCE_TO_SINGLE_ELEMENT: None, + } return cls.backend_features def codegen_comment(self, node_schedule): diff --git a/torch/_inductor/codegen/triton_split_scan.py b/torch/_inductor/codegen/triton_split_scan.py index 3ffe313aec4da..a2cebb90f92a5 100644 --- a/torch/_inductor/codegen/triton_split_scan.py +++ b/torch/_inductor/codegen/triton_split_scan.py @@ -49,6 +49,9 @@ def __init__( def should_use_persistent_reduction(self) -> bool: return False + def should_use_cooperative_reduction(self) -> bool: + return False + def initialize_range_tree(self, pid_cache): prefixes = "yxr" assert len(self.numels) <= len( diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index ec6d72b93bfe6..07f72a7593717 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -524,6 +524,7 @@ def add_import_once(line: str) -> None: self._meta_vars: Set[str] = set() self.multi_kernel_state = MultiKernelState() self.already_codegened_subgraphs: Set[str] = set() + self.allocated_workspaces: Dict[str, Any] = {} # intermediate tensor value printing utility self.debug_printer = DebugPrinterManager( @@ -612,7 +613,14 @@ def write_triton_header_once(self) -> None: import_str = f""" import triton import triton.language as tl - from {triton_heuristics.__name__} import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph + from {triton_heuristics.__name__} import ( + grid, + split_scan_grid, + grid_combo_kernels, + start_graph, + end_graph, + cooperative_reduction_grid, + ) """ self.imports.splice(import_str, strip=True) if config.triton.autotune_at_compile_time: @@ -1539,7 +1547,7 @@ def generate_workspace_allocation(self, ws: WorkspaceArg): self.writeline(line) self.writeline(self.make_zero_buffer(name)) elif ws.zero_mode == WorkspaceZeroMode.ZERO_PER_GRAPH: - prior = V.graph.allocated_workspaces.get(name) + prior = self.allocated_workspaces.get(name) if prior: assert isinstance(prior, AllocateLine) # expand existing allocation @@ -1547,7 +1555,7 @@ def generate_workspace_allocation(self, ws: WorkspaceArg): else: self.writeline(line) self.writeline(self.make_zero_buffer(name)) - V.graph.allocated_workspaces[name] = line + self.allocated_workspaces[name] = line else: raise AssertionError(ws.zero_mode) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 55563e5dd3521..82204f4b9e8e5 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -472,8 +472,12 @@ def use_autoheuristic(name: str) -> bool: # Convert 1x1 convs into matmuls conv_1x1_as_mm = False -# Enable split reductions for better utilization when the dimension -# being reduced over is large (by splitting it) +# For reductions with a small output size (usually 1, e.g. x.sum()) there is not enough +# parallelism to saturate the GPU. We have two ways of handling this, either `split_reductions` +# or `triton.cooperative_reductions` which are mutually exclusive. 
+# split_reductions: uses multiple kernels to gain more parallelism +# triton.cooperative_reductions: uses cross thread-block synchronization to gain more parallelism +# enabling both of these will implicitly disable split_reductions split_reductions = True benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1" @@ -972,6 +976,14 @@ class triton: os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1" ) + # For small output size reductions uses cross thread-block synchronization to gain more parallelism + cooperative_reductions = ( + os.environ.get("TORCHINDUCTOR_COOPERATIVE_REDUCTIONS", "0") == "1" + ) + + # used for debugging cooperative reduction codegen, always generate cooperative_reductions + force_cooperative_reductions = False + # 0/False: disable # 1/True: enable, use tuning to pick between different subkernels # 2: enable, force using persistent reduction (for debugging) diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 9179de5a5f618..2bbab0ee2c7f4 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -475,8 +475,7 @@ def __init__( # Below field is related to printing debug intermediate tensor values info for debugging self.all_codegen_kernel_names: OrderedSet[str] = OrderedSet() - # state used by wrapper.generate_workspace_allocation() - self.allocated_workspaces: Dict[str, Any] = {} + # state used by for Kernel.workspace self.workspace_id = itertools.count() def has_feature( diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 695b94b27a737..c2975e8d03645 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -879,7 +879,7 @@ def _is_static(x): reduction_numel_hint = V.graph.sizevars.symbolic_hint(reduction_numel) numel_hint = V.graph.sizevars.symbolic_hint(sympy_product(ranges)) - should_split = ( + should_split = reduction_type == "scan" or ( not V.graph.has_feature(device, BackendFeature.REDUCE_TO_SINGLE_ELEMENT) and reduction_type not in ( @@ -887,11 +887,9 @@ def _is_static(x): "argmin", ) and config.split_reductions - # We don't support unbacked symints - and _is_static(reduction_numel_hint) - and _is_static(numel_hint) ) - if not should_split: + if not (_is_static(reduction_numel_hint) and _is_static(numel_hint)): + # We don't support unbacked symints return ReductionHint.DEFAULT, 1 device_interface = get_interface_for_device(get_device_type(device)) # type: ignore[arg-type] # next PR @@ -909,6 +907,8 @@ def _is_static(x): max_elements_per_device = max_elements_per_thread * num_sm * threads_per_sm def inner_reduction_splits(reduction_numel_hint, numel_hint): + if not should_split: + return 1 # do heuristics that's close to eager mode for split inner reduction # we leak reduction autotune configs here, and will need to refactor to avoid this later num_warps = 8 @@ -945,6 +945,8 @@ def inner_reduction_splits(reduction_numel_hint, numel_hint): ) def outer_reduction_splits(reduction_numel_hint, numel_hint): + if not should_split: + return 1 # TODO the best heuristic currently has XBLOCK (corresponding to numel_hint) 128 # extend to even smaller number of outputs num_warps = 8 @@ -1961,7 +1963,7 @@ def wrapper_fn(idx, reduction_idx): inner_fn=wrapper_fn, ranges=pointwise_ranges, reduction_ranges=scan_ranges, - reduction_type="sum", + reduction_type="scan", reduction_numel=scan_numel, ) diff --git a/torch/_inductor/runtime/hints.py b/torch/_inductor/runtime/hints.py index cd62fde545084..0a52cf492a0ed 100644 --- a/torch/_inductor/runtime/hints.py +++ 
b/torch/_inductor/runtime/hints.py @@ -12,6 +12,7 @@ "Z": 1024, "R": 4096 * 16, # * 16 is multi-kernel only } +TRITON_MAX_RSPLIT = 64 class ReductionHint(Enum): diff --git a/torch/_inductor/runtime/triton_helpers.py b/torch/_inductor/runtime/triton_helpers.py index faa1d3943c06d..020d593f7cf50 100644 --- a/torch/_inductor/runtime/triton_helpers.py +++ b/torch/_inductor/runtime/triton_helpers.py @@ -614,3 +614,36 @@ def select_one(x, mask, dim, keep_dims=False): ix = x.to(idtype, bitcast=True) iy = tl.sum(ix * mask, dim, keep_dims=keep_dims) return iy.to(x.dtype, bitcast=True) + + +@triton.jit +def x_grid_barrier(sem): + """ + Wait for all other thread blocks in grid sharing same y/z program_id + to reach this barrier before returning. + + Args: + sem: an uint32 semaphores, zero or 0x80000000 initialized. Must be unique to each y/z program ID. + """ + # ensure stores before this are visible + tl.debug_barrier() + + one_i32 = 1 + one_u32 = one_i32.to(tl.uint32) # type: ignore[attr-defined] + expected = tl.num_programs(0).to(tl.uint32) + if tl.program_id(0) == 0: + nb = 0x80000000 - (expected - one_u32) + else: + nb = one_u32 + + old_arrive = tl.atomic_add(sem, nb, sem="release") + + bar_flipped = False + while not bar_flipped: + # want a `ld.acquire.gpu.u32 $0,[$1];` but Triton doesn't have it + # current_arrive = tl.atomic_add(sem, 0, sem="acquire") + current_arrive = tl.load(sem, volatile=True) # is missing .acquire + bar_flipped = ((old_arrive ^ current_arrive) & 0x80000000) != 0 + + # TODO(jansel): is this needed? + tl.debug_barrier() diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 6a55df5fe3944..cedec8efa4141 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -30,6 +30,7 @@ ReductionHint, TileHint, TRITON_MAX_BLOCK, + TRITON_MAX_RSPLIT, ) from .runtime_utils import ( cache_dir, @@ -288,6 +289,7 @@ def precompile(self, warm_cache_only=False): if ( self.inductor_meta.get("dynamic_scale_rblock", True) + and not self.inductor_meta.get("persistent_reduction") and self.heuristic_type == HeuristicType.REDUCTION and self.size_hints is not None # Disable for Intel as Triton is not ready to return n_regs for a compiled_binary. 
@@ -1633,18 +1635,51 @@ def reduction( ) -def persistent_reduction( +def cooperative_reduction( size_hints, - reduction_hint=False, - triton_meta=None, - filename=None, - inductor_meta=None, + reduction_hint, + triton_meta, + filename, + inductor_meta, ): inductor_meta = {} if inductor_meta is None else inductor_meta inductor_meta["reduction_hint"] = reduction_hint if inductor_meta.get("no_x_dim"): size_hints = [1, *size_hints[1:]] + xnumel, rnumel = size_hints + + # TODO(jansel): we should base target on the SM count of the local GPU + target = 64 + split = max(1, min(target // xnumel, TRITON_MAX_RSPLIT)) + assert rnumel >= split + assert split <= TRITON_MAX_RSPLIT + if inductor_meta["persistent_reduction"]: + configs = _persistent_reduction_configs( + [xnumel, rnumel // split], reduction_hint, inductor_meta + ) + else: + configs = _reduction_configs( + size_hints=[xnumel, rnumel // split], inductor_meta=inductor_meta + ) + for config in configs: + config.kwargs["RSPLIT"] = split + # TODO(jansel): add more configs in max_autotune + + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.REDUCTION, + filename=filename, + ) + +def _persistent_reduction_configs( + size_hints, + reduction_hint=False, + inductor_meta=None, +): xnumel, rnumel = size_hints configs = [ @@ -1671,6 +1706,23 @@ def persistent_reduction( if disable_pointwise_autotuning(inductor_meta): configs = configs[:1] + return configs + + +def persistent_reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + configs = _persistent_reduction_configs(size_hints, reduction_hint, inductor_meta) + return cached_autotune( size_hints, configs, @@ -1825,6 +1877,15 @@ def grid_fn(meta): return grid_fn +def cooperative_reduction_grid(xnumel): + def grid_fn(meta): + return (meta["RSPLIT"], ceildiv(xnumel, meta.get("XBLOCK", 1)), 1) + + grid_fn_str = f"cooperative_reduction_grid({xnumel})" + setattr(grid_fn, "grid_fn_str", grid_fn_str) # noqa: B010 + return grid_fn + + def split_scan_grid(xnumel, rnumel): def grid_fn(meta): assert meta.get("XBLOCK", 1) == 1 From ee11e2da1ebb618ee814df052d66e043c9268e25 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 25 Oct 2024 16:59:20 -0700 Subject: [PATCH 142/161] [PGNCCL] Use non-blocking mode by default in eager init (#138527) ### Why use non-blocking mode in eager init? For overlapping comm init and model init, etc. ![image](https://github.com/user-attachments/assets/9b0bf7a9-be26-4d16-827b-dbe861f083cd) ### Why can we set non-blocking as default? If the setting is dangling -- i.e. not passed in by user nor set via env -- `ProcessGroupNCCL` can have some preferred logic. And torch-level API semantics does not change whether the NCCL comm is blocking or non-blocking (handled within `ProcessGroupNCCL`). ### Why not make non-blocking default for lazy mode as well? PR https://github.com/pytorch/pytorch/pull/137544 tried it. Two reasons why that's not preferred today: 1. It is hard -- too big a blast. 2. There is no gain by doing lazy init in non-blocking mode, because the right next CPU call is a collective, and we will block there waiting for comm to be ready, so same effect as blocked init, no "opening" compared to eager mode. 
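### What eager init looks like from the Python side

For reference, a minimal sketch of the user-facing path this change targets. This is not part of the patch itself: the helper name, rank/world-size wiring, and the `MASTER_ADDR`/`MASTER_PORT` values are placeholders; only the `device_id=` eager-init usage mirrors the tests below.

```python
import os
import torch
import torch.distributed as dist


def demo(rank: int, world_size: int) -> None:
    # Placeholder rendezvous settings for a single-node run (env:// init).
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")

    # Passing device_id binds the process group to a device and connects
    # eagerly; with this PR, that eager path prefers NCCL's non-blocking
    # comm init unless the user pins `blocking` in the NCCL config or sets
    # TORCH_NCCL_USE_COMM_NONBLOCKING explicitly.
    dist.init_process_group(
        "nccl", rank=rank, world_size=world_size, device_id=device
    )

    t = torch.ones(1, device=device)
    dist.all_reduce(t)  # first collective; comm readiness is handled inside PGNCCL
    dist.destroy_process_group()
```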
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138527 Approved by: https://github.com/wconstab ghstack dependencies: #138860 --- test/distributed/test_c10d_nccl.py | 64 +++++++++---------- torch/csrc/cuda/nccl.cpp | 7 +- torch/csrc/distributed/c10d/NCCLUtils.cpp | 12 +--- torch/csrc/distributed/c10d/NCCLUtils.hpp | 37 ++++++----- .../distributed/c10d/ProcessGroupNCCL.cpp | 48 ++++++++++++-- .../distributed/c10d/ProcessGroupNCCL.hpp | 8 +++ 6 files changed, 106 insertions(+), 70 deletions(-) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 6d81901a7a66c..64a210ed3b6c0 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -321,25 +321,30 @@ def abortpg(): @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - def test_close_pg(self): + @parametrize("eager_init", [True, False]) + def test_close_pg(self, eager_init: bool): # Disable ASYNC_ERROR_HANDLING for this test to ensure we can programmatically # abort the process group. os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0" store = c10d.FileStore(self.file_name, self.world_size) - pg = self._create_process_group_nccl(store, self.opts()) - device = self.rank_to_GPU[self.rank][0] + device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + c10d.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + device_id=device if eager_init else None, + ) t = torch.rand(10, 10, device=device) # First allreduce to initialize state. - pg.allreduce(t) + dist.all_reduce(t) # Destroy pg and validate pg is no longer valid dist.destroy_process_group() - with self.assertRaises(dist.DistBackendError): - pg.allreduce([t]) - - del pg + with self.assertRaises(ValueError): + dist.all_reduce(t) CUDA_12_AND_ABOVE = torch.cuda.is_available() and ( torch.version.cuda is not None and int(torch.version.cuda.split(".")[0]) >= 12 @@ -803,27 +808,24 @@ def test_extend_nccl_pg_timeout(self, backend): @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") - def test_comm_lazy_init_split(self): + @parametrize("eager_init", [True, False]) + def test_new_group(self, eager_init: bool): # Test the optimization of new groups that contain all world # ranks use the "transparent" `ncclCommSplit` optimization. store = c10d.FileStore(self.file_name, self.world_size) - pg = self._create_process_group_nccl(store, self.opts()) - - # Test lazy splitting behavior across each per-device backend. - for device in self.rank_to_GPU[self.rank]: - backend = pg._get_backend(torch.device(device)) - - # split doesn't happen unless the original process group has lazily - # created communicators, so first verify we haven't split even when - # making the new group and running an operation on the original pg. - ng = c10d.new_group() - tensor = torch.tensor([self.rank]).cuda(device) - pg.broadcast(tensor, 0) - self.assertEqual(backend.comm_split_count(), 0) - - # The new group will not force a split because it is a lazy init. 
- ng.broadcast(tensor, 0) - self.assertEqual(backend.comm_split_count(), 0) + device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + c10d.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + device_id=device if eager_init else None, + ) + ng = c10d.new_group() + tensor = torch.tensor([self.rank], device=device) + dist.broadcast(tensor, 0) + dist.broadcast(tensor, 0, group=ng) + dist.destroy_process_group() @requires_nccl_version((2, 18), "Need NCCL 2.18+ for ncclCommSplit") @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") @@ -863,15 +865,11 @@ def test_comm_eager_init_subgroup(self): pg = self._create_process_group_nccl(store, self.opts()) backend = pg._get_backend(torch.device(device)) self.assertEqual(backend._is_initialized(), False) - - tensor = torch.full((1,), self.rank).cuda(device) + # create a subgroup eagerly new_group = c10d.new_group([0, 1], device_id=device) - self.assertEqual(backend.comm_split_count(), 0) - - new_backend = new_group._get_backend(torch.device(device)) - self.assertEqual(new_backend._is_initialized(), True) + tensor = torch.full((1,), self.rank).cuda(device) dist.broadcast(tensor, 0, group=new_group) - self.assertEqual(new_backend.comm_split_count(), 0) + # the default group should stay lazy self.assertEqual(backend._is_initialized(), False) torch.cuda.synchronize() dist.destroy_process_group() diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index a426d9043fa66..7be7b08efc6a6 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -159,7 +159,6 @@ static inline void NCCL_CHECK(ncclResult_t result) { } // TODO(eqy): can this duplication be avoided from NCCLUtils.cpp? -// Default value: on bool nccl_use_nonblocking() { static bool nccl_use_nonblocking_ = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; @@ -194,7 +193,8 @@ static inline void NCCL_CHECK_TIMEOUT(ncclResult status, ncclComm_t comm) { currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error("NCCL timeout."); + throw std::runtime_error( + "NCCL timeout when waiting for nonblocking call to become successful."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comm), &result); @@ -226,7 +226,8 @@ static inline void NCCL_CHECK_TIMEOUT( currentTimepoint - startTimepoint) .count(); if (timeElapsed > nccl_nonblocking_timeout()) { - throw std::runtime_error("NCCL timeout."); + throw std::runtime_error( + "NCCL timeout when waiting for nonblocking call to become successful."); } sched_yield(); // yield to other threads ncclCommGetAsyncError(to_nccl_comm(comms[i]), &result); diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index e5fb9abacdb88..00bd235c86666 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -31,7 +31,7 @@ ncclComm_t NCCLComm::getNcclComm() { commFailureMsg)); } // In non-blocking mode, ensure comm is ready. - if (nccl_use_nonblocking()) { + if (nonBlocking_) { // If timeout is reached, throw an exception. 
C10D_NCCL_CHECK_TIMEOUT_SLEEP(ncclInProgress, ncclComm_, std::nullopt); // ncclComm_ should be initialized by now @@ -101,6 +101,7 @@ std::shared_ptr NCCLComm::split( #endif ++source->ncclCommSplitCounter_; comm->rank_ = rank; + comm->nonBlocking_ = config.blocking == 0; LOG(INFO) << "Rank " << source->rank_ << ": created child comm " << comm->repr() << " with color_id " << color_id; return comm; @@ -163,15 +164,6 @@ size_t hashTensors(const std::vector& tensors) { } #endif -bool nccl_use_nonblocking() { - static bool nccl_use_nonblocking_ = - c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING") == true; - if (nccl_use_nonblocking_) { - TORCH_WARN_ONCE("Using experimental non-blocking NCCL communicator."); - } - return nccl_use_nonblocking_; -} - // Default value: 30 minutes int nccl_nonblocking_timeout() { static int timeout = -2; // -2 means not initialized diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 27b8b8f8e9547..0089d453bb85a 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -236,7 +236,6 @@ DEFINE_CONSTANT(started_state, "started"); TORCH_API size_t hashTensors(const std::vector& tensors); TORCH_API std::string getNcclVersion(); TORCH_API std::string ncclGetErrorWithVersion(ncclResult_t error); -bool nccl_use_nonblocking(); int nccl_nonblocking_timeout(); // Provides additional detail into NCCL error codes based on when these are @@ -311,6 +310,8 @@ class NCCLComm { comm->ncclId_ = commId; comm->rank_ = rank; comm->initialized_ = true; + // Old style comm is always blocking. + comm->nonBlocking_ = false; return comm; } @@ -321,26 +322,19 @@ class NCCLComm { ncclUniqueId commId, ncclConfig_t& config) { auto comm = std::make_shared(); - bool isInitialized = false; - if (nccl_use_nonblocking()) { - config.blocking = 0; - LOG(INFO) << "Rank " << rank - << ": creating NCCL communicator in nonblocking mode"; - C10D_NCCL_CHECK_NONBLOCKING( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - std::nullopt); - } else { - C10D_NCCL_CHECK( - ncclCommInitRankConfig( - &(comm->ncclComm_), numRanks, commId, rank, &config), - std::nullopt); - // under blocking mode, comm is initialized after NCCL CHECK - isInitialized = true; - } + comm->nonBlocking_ = config.blocking == 0; + LOG(INFO) << "Rank " << rank << ": creating NCCL communicator with mode: " + << (comm->nonBlocking_ ? "nonblocking" : "blocking"); + C10D_NCCL_CHECK_NONBLOCKING( + ncclCommInitRankConfig( + &(comm->ncclComm_), numRanks, commId, rank, &config), + std::nullopt); comm->ncclId_ = commId; comm->rank_ = rank; - comm->initialized_ = isInitialized; + // Under blocking mode, comm is initialized immediately after NCCL init + // returns; Under nonblocking mode, we check whether comm is initialized the + // *next* time ncclComm_ is accessed. + comm->initialized_ = !comm->nonBlocking_; return comm; } @@ -385,6 +379,7 @@ class NCCLComm { std::swap(aborted_, other.aborted_); std::swap(ncclAsyncErr_, other.ncclAsyncErr_); std::swap(initialized_, other.initialized_); + std::swap(nonBlocking_, other.nonBlocking_); } ncclComm_t getNcclComm(); @@ -553,6 +548,10 @@ class NCCLComm { // better error messaging. std::optional commFailureReason_{}; bool initialized_{false}; + // Whether this communicator is using nonblocking mode. Recorded during comm + // creation or split. For safety, we give a default value of true (more + // protection). 
+ bool nonBlocking_{true}; #ifdef NCCL_HAS_COMM_REGISTER // Stores handlers for tensors registered by NCCL std::unordered_map registeredSegmentHandles_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 6206b4d6c5994..c9564a31f057c 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -987,7 +987,6 @@ ProcessGroupNCCL::ProcessGroupNCCL( << ", TORCH_NCCL_ENABLE_TIMING: " << enableTiming_.load() << ", TORCH_NCCL_BLOCKING_WAIT: " << blockingWait_ << ", TORCH_DISTRIBUTED_DEBUG: " << torch_distributed_debug - << ", TORCH_NCCL_USE_COMM_NONBLOCKING: " << nccl_use_nonblocking() #ifdef NCCL_HAS_COMM_REGISTER << ", TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: " << useTensorRegisterAllocatorHook_ @@ -1059,6 +1058,39 @@ void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) { getNCCLComm(key, device, OpType::ALLREDUCE); } +bool ProcessGroupNCCL::useNonblocking() { +#ifndef NCCL_HAS_COMM_NONBLOCKING + return false; +#endif + // Already parsed, return the cached value + if (useNonblocking_.has_value()) { + return useNonblocking_.value(); + } + // Get environment variable. + auto nbEnv = c10::utils::check_env("TORCH_NCCL_USE_COMM_NONBLOCKING"); + + // 1st priority: Respect the user's setting + if (options_->config.blocking != NCCL_CONFIG_UNDEF_INT) { + useNonblocking_ = options_->config.blocking == 0; + } + // 2nd priority: Respect the environment variable + else if (nbEnv.has_value()) { + useNonblocking_ = nbEnv.value(); + } + // 3rd priority: automatically use nonblocking if we are in eager init mode + else if (getBoundDeviceId()) { + useNonblocking_ = true; + } + // 4th priority: otherwise, nonblocking = false to preserve old behavior + else { + useNonblocking_ = false; + } + + LOG(INFO) << logPrefix() + << "Using non-blocking mode: " << useNonblocking_.value(); + return useNonblocking_.value(); +} + void ProcessGroupNCCL::performNocolorSplit(at::Device device) { // If our backend doesn't support splitting, this is a no-op for // ranks not in the new subgroup (and ranks that would be in it will @@ -1067,6 +1099,8 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) { const auto key = getKeyFromDevice(device); LOG(INFO) << logPrefix() << "Performing nocolor split on backend device " << device << ", key " << key << ", i am " << this; + bool useNb = useNonblocking(); + options_->config.blocking = useNb ? 0 : 1; auto comm = getNCCLComm(key, device, OpType::ALLREDUCE); NCCLComm::split( comm.get(), @@ -2357,6 +2391,11 @@ std::shared_ptr ProcessGroupNCCL::getNCCLComm( rank = p2pRank; } +#ifdef NCCL_HAS_COMM_NONBLOCKING + bool useNb = useNonblocking(); + options_->config.blocking = useNb ? 0 : 1; +#endif + #ifdef NCCL_HAS_COMM_SPLIT if (options_->split_from) { // Find a valid, healthy communicator to split from if possible. 
@@ -2773,7 +2812,7 @@ c10::intrusive_ptr ProcessGroupNCCL::endCoalescing(OpType optype) { work->ncclStartEvent_->record(ncclStream); } - if (nccl_use_nonblocking()) { + if (useNonblocking()) { groupEndNonblocking(comm); } else { groupEnd(); @@ -3093,8 +3132,7 @@ c10::intrusive_ptr ProcessGroupNCCL::collectiveCoalesced( #endif { - torch::cuda::nccl::AutoNcclGroup nccl_group_guard( - comm, nccl_use_nonblocking()); + torch::cuda::nccl::AutoNcclGroup nccl_group_guard(comm, useNonblocking()); for (const auto i : c10::irange(inputs.size())) { // Both `inputs' and `outputs' are created on a worker stream and used in // different ncclStreams. Hence, both must record the ncclStream to @@ -4662,7 +4700,7 @@ void ProcessGroupNCCL::groupEndNonblocking( #ifndef NCCL_HAS_COMM_NONBLOCKING C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); #else - if (!nccl_use_nonblocking()) { + if (!useNonblocking()) { C10D_NCCL_CHECK(ncclGroupEnd(), std::nullopt); } else { C10D_NCCL_CHECK_TIMEOUT_GROUPEND(ncclGroupEnd(), comm, std::nullopt); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 5ec9ae32405f6..839463a9d8be1 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -778,6 +778,10 @@ class TORCH_API ProcessGroupNCCL : public Backend { // Abort all communicators on this rank. bool abortComms(const std::optional& abortReason = std::nullopt); + // A helper function to check if nonblocking API mode should be used. + // Use this helper instead of directly checking `useNonblocking_` variable. + bool useNonblocking(); + private: int globalRankStart; int globalRankStride; @@ -1237,6 +1241,10 @@ class TORCH_API ProcessGroupNCCL : public Backend { std::shared_ptr pgStatus_ = std::make_shared(); + + // Internal cached value: use NCCL non-blocking API mode or not. + // Use `useNonblocking()` method instead of accessing this variable directly. + std::optional useNonblocking_{std::nullopt}; }; // Dumps the NCCL comm traces and additional information about the Process From 652a2ab93e2f8bdd666ee9f314c41508b0edfc68 Mon Sep 17 00:00:00 2001 From: Nikita Shulga <2453524+malfet@users.noreply.github.com> Date: Sun, 27 Oct 2024 18:04:03 +0000 Subject: [PATCH 143/161] [BE] Skip `print(foo)` tests (#139009) Skipped `test_exponential` and `test_multinomial` because simply printing the result of an operator does not constitute a test. The testing framework does not attempt to interpret the output. 
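For illustration, a minimal sketch (hypothetical test name, not part of this PR) of the difference: a bare `print` can never fail a test, while asserting on the string representation at least forces the repr path to run without raising:

```python
import unittest

import torch


class PrintSmokeTest(unittest.TestCase):
    def test_nonzero_repr(self):
        x = torch.ones(100, 100).nonzero()
        # print(x) would always "pass"; asserting on str(x) exercises the same
        # formatting code but fails if repr raises or returns an empty string.
        self.assertTrue(str(x))
```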
Modify `test_print_non_contiguous` to get tensors string representation, which is an equivalent operation Pull Request resolved: https://github.com/pytorch/pytorch/pull/139009 Approved by: https://github.com/Skylion007 --- test/test_mps.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_mps.py b/test/test_mps.py index 4540e154ccfa9..cf499603f809c 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -8123,6 +8123,7 @@ def helper(shape, low, high, dtype=torch.int32): self.assertNotEqual(x.max().item(), 0) # Test exponential + @unittest.skip("This does not test anything") def test_exponential(self): def helper(shape, lamda, dtype=torch.float32): @@ -8326,6 +8327,7 @@ def helper(shape): helper(10000) helper((10000, 40)) + @unittest.skip("This does not test anything") def test_multinomial(self): # Test with num_dist = 1 def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True): @@ -8855,8 +8857,10 @@ def test_permute(self): # Printing of non_contiguous should not crash def test_print_non_contiguous(self): - print(torch.ones(100, 100, device='mps').nonzero()) - print(torch.ones(100, 100, device='mps').nonzero().contiguous()) + # print(obj) is equivalent to calling `x=str(obj); print(x)` + # Use assertTrue in case to make sure non-empty string is returned + self.assertTrue(str(torch.ones(100, 100, device='mps').nonzero())) + self.assertTrue(str(torch.ones(100, 100, device='mps').nonzero().contiguous())) def test_zero_grad(self): i = torch.randn(2, 5, requires_grad=True) From beb15c80fb38ea54de4148bc8987db7751f060d7 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sun, 27 Oct 2024 18:08:28 +0000 Subject: [PATCH 144/161] print USE_STATIC_MKL for further debug. (#138902) print `USE_STATIC_MKL` for further debug. image if we use `MKL`, then show its link method. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138902 Approved by: https://github.com/ezyang --- cmake/Summary.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3f70465c91d6d..8e245262e84ea 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -143,6 +143,9 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}") message(STATUS " USE_MPS : ${USE_MPS}") message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}") + if(${CAFFE2_USE_MKL}) + message(STATUS " USE_STATIC_MKL : ${USE_STATIC_MKL}") + endif() message(STATUS " USE_MKLDNN : ${USE_MKLDNN}") if(${USE_MKLDNN}) message(STATUS " USE_MKLDNN_ACL : ${USE_MKLDNN_ACL}") From 192385e261eec0770822b76b0c24055fd1f70fc5 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 24 Oct 2024 17:01:53 -0400 Subject: [PATCH 145/161] Add sym_sum to TorchInGraphFunctionVariable (#138848) Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/138848 Approved by: https://github.com/Skylion007 --- torch/_dynamo/trace_rules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/_dynamo/trace_rules.py b/torch/_dynamo/trace_rules.py index 7a8d25d98c4d4..2f91f04453950 100644 --- a/torch/_dynamo/trace_rules.py +++ b/torch/_dynamo/trace_rules.py @@ -189,6 +189,7 @@ "torch.sym_min": TorchInGraphFunctionVariable, "torch.sym_sqrt": TorchInGraphFunctionVariable, "torch.sym_ite": TorchInGraphFunctionVariable, + "torch.sym_sum": TorchInGraphFunctionVariable, "torch.Tensor#_make_wrapper_subclass": SkipFunctionVariable, "torch.Tensor#__init__": SkipFunctionVariable, "torch.cuda.set_device": SkipFunctionVariable, From d2ec28978730db514931e4dc146f8188738ed1ff Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 27 Oct 2024 20:07:36 +0000 Subject: [PATCH 146/161] Turn header static function into inline (#138671) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138671 Approved by: https://github.com/ezyang --- aten/src/ATen/native/cpu/IndexKernelUtils.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cpu/IndexKernelUtils.h b/aten/src/ATen/native/cpu/IndexKernelUtils.h index 876f759e130f2..c513d128e2342 100644 --- a/aten/src/ATen/native/cpu/IndexKernelUtils.h +++ b/aten/src/ATen/native/cpu/IndexKernelUtils.h @@ -4,8 +4,7 @@ namespace at::native { -namespace { -static bool is_constant_index(int ntensor, const int64_t* strides) { +inline bool is_constant_index(int ntensor, const int64_t* strides) { AT_ASSERT(ntensor >= 3); for (const auto arg : c10::irange(2, ntensor)) { if (strides[arg] != 0) { @@ -49,7 +48,6 @@ struct Indexer { return offset; } }; -} // anonymous namespace template void cpu_index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, From 7cb3cef05f4b1d1b448a82a01420e2a9ed1ccfe0 Mon Sep 17 00:00:00 2001 From: cyyever Date: Mon, 28 Oct 2024 01:38:02 +0000 Subject: [PATCH 147/161] [3/N] Fix cppcoreguidelines-special-member-functions warnings (#138796) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138796 Approved by: https://github.com/ezyang --- aten/src/ATen/TensorIterator.cpp | 2 -- aten/src/ATen/TensorIterator.h | 6 +++++- aten/src/ATen/code_template.h | 3 +++ aten/src/ATen/core/Array.h | 6 ++++++ aten/src/ATen/core/Dict.h | 1 + aten/src/ATen/core/Formatting.cpp | 8 ++++++-- aten/src/ATen/core/List.h | 4 ++++ aten/src/ATen/core/List_test.cpp | 4 ++-- aten/src/ATen/core/NamedTensor.h | 4 ++++ aten/src/ATen/core/PythonFallbackKernel.cpp | 4 ++++ aten/src/ATen/core/function_schema.h | 1 + aten/src/ATen/core/ivalue.h | 5 +++++ torch/csrc/cuda/CUDAPluggableAllocator.h | 6 +++++- torch/csrc/profiler/collection.h | 1 + torch/csrc/profiler/perf.h | 2 ++ torch/csrc/profiler/standalone/privateuse1_observer.h | 4 ++++ torch/csrc/utils/object_ptr.h | 2 ++ torch/csrc/utils/throughput_benchmark.h | 1 + 18 files changed, 56 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 19d12769fb80d..c151c8d7731b7 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1483,8 +1483,6 @@ FastSetupType TensorIteratorBase::compute_fast_setup_type(const TensorIteratorCo return FastSetupType::NONE; } -TensorIteratorBase::TensorIteratorBase() = default; - void TensorIteratorBase::build(TensorIteratorConfig& config) { // populate some persistent configuration 
fields is_reduction_ = config.is_reduction_; diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index e9e9e0c8e8bfe..471faf664e271 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -250,7 +250,6 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { using PtrVector = SmallVector; using StrideVector = SmallVector; - TensorIteratorBase(); void build(TensorIteratorConfig&); // The inner-loop function operates on the fastest moving dimension. It @@ -788,6 +787,9 @@ class TORCH_API TensorIteratorConfig final { TensorIteratorConfig() = default; C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); + TensorIteratorConfig(TensorIteratorConfig&&) = default; + TensorIteratorConfig& operator=(TensorIteratorConfig&&) = default; + ~TensorIteratorConfig() = default; /// Construction // Stores input/output Tensors without incrementing the reference count. @@ -997,6 +999,8 @@ struct TORCH_API SplitUntil32Bit { iterator() = default; iterator(const TensorIteratorBase& iter); iterator(iterator&&) = default; + iterator& operator=(iterator&&) = default; + ~iterator() = default; // Guaranteed to be a TensorIterator proper! TensorIterator& operator*() const; diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index ee7488b4e348c..2026795fc0a3d 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -19,7 +19,10 @@ namespace at::jit { struct TemplateEnv { TemplateEnv() = default; TemplateEnv(TemplateEnv& parent) : parent(&parent) {} + TemplateEnv(TemplateEnv&&) = delete; TemplateEnv& operator=(const TemplateEnv& parent) = delete; + TemplateEnv& operator=(TemplateEnv&& parent) = delete; + ~TemplateEnv() = default; using string_list = std::vector; diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 8372fe81c5c5a..945c4195c9aa5 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -23,10 +23,16 @@ struct Array { C10_HOST_DEVICE Array() = default; C10_HOST_DEVICE Array(const Array&) = default; C10_HOST_DEVICE Array& operator=(const Array&) = default; + C10_HOST_DEVICE Array(Array&&) = default; + C10_HOST_DEVICE Array& operator=(Array&&) = default; + C10_HOST_DEVICE ~Array() = default; #else Array() = default; Array(const Array&) = default; Array& operator=(const Array&) = default; + Array(Array&&) = default; + Array& operator=(Array&&) = default; + ~Array() = default; #endif static constexpr int size() { return size_; diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index d88250fbdd08c..a1d4da07520fa 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -83,6 +83,7 @@ class DictEntryRef final { static_assert(std::is_constructible_v, "Wrong type for the value argument of setValue()"); iterator_->second = Value(std::forward(value)); } + ~DictEntryRef() = default; private: // allow copying and moving, but only our friends (i.e. 
the Dict class) can do diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 22f1490fc4908..7762e543234ad 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -50,16 +50,20 @@ inline std::ios_base& defaultfloat(std::ios_base& __base) { //saves/restores number formatting inside scope struct FormatGuard { FormatGuard(std::ostream & out) - : out(out), saved(nullptr) { + : out(out) { saved.copyfmt(out); } ~FormatGuard() { out.copyfmt(saved); } + FormatGuard(const FormatGuard&) = delete; + FormatGuard(FormatGuard&&) = delete; + FormatGuard& operator=(const FormatGuard&) = delete; + FormatGuard& operator=(FormatGuard&&) = delete; private: // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) std::ostream & out; - std::ios saved; + std::ios saved{nullptr}; }; std::ostream& operator<<(std::ostream & out, const DeprecatedTypeProperties& t) { diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 34cdd738b95f1..5431e1c8e31ba 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -88,6 +88,7 @@ class ListElementReference final { ListElementReference(const ListElementReference&) = delete; ListElementReference& operator=(const ListElementReference&) = delete; + ~ListElementReference() = default; private: ListElementReference(Iterator iter) @@ -273,6 +274,9 @@ class List final { List(const List&) = default; List& operator=(const List&) = default; + List(List&&) = default; + List& operator=(List&&) = default; + ~List() = default; /** * Create a new List pointing to a deep copy of the same data. diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 1460891689e2d..77b4281d3627a 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -3,7 +3,7 @@ using namespace c10; -// NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move) +// NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move) TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) { List list; EXPECT_TRUE(list.empty()); @@ -1162,4 +1162,4 @@ TEST(ListTest, toTypedList) { genericList = impl::toList(std::move(stringList)); EXPECT_THROW(c10::impl::toTypedList(std::move(genericList)), c10::Error); } -// NOLINTEND(performance-move-const-arg, bugprone-use-after-move) +// NOLINTEND(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move) diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 02d226a01973d..81998e160185a 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -82,6 +82,10 @@ struct TORCH_API NoNamesGuard { NoNamesGuard() : prev_mode(NamesMode::is_enabled()) { NamesMode::set_enabled(false); } + NoNamesGuard(const NoNamesGuard&) = delete; + NoNamesGuard(NoNamesGuard&&) = delete; + NoNamesGuard& operator=(const NoNamesGuard&) = delete; + NoNamesGuard& operator=(NoNamesGuard&&) = delete; ~NoNamesGuard() { if (initialized) { reset(); diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index ec8d3b2d10d52..efd9508ce15c2 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -35,6 +35,10 @@ struct StashTLSOnEntryGuard { StashTLSOnEntryGuard(): saved_(tls_on_entry.value()) { tls_on_entry = std::nullopt; } + StashTLSOnEntryGuard(const StashTLSOnEntryGuard&) = delete; + StashTLSOnEntryGuard(StashTLSOnEntryGuard&&) = delete; + 
StashTLSOnEntryGuard& operator=(const StashTLSOnEntryGuard&) = delete; + StashTLSOnEntryGuard& operator=(StashTLSOnEntryGuard&&) = delete; ~StashTLSOnEntryGuard() { TORCH_INTERNAL_ASSERT(!tls_on_entry.has_value()); diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 081e38e49b867..ce550f03e2cec 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -82,6 +82,7 @@ struct TORCH_API Argument { } return *this; } + ~Argument() = default; const std::string& name() const { return name_; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 42a03ea946027..514aa6eb55187 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -1352,8 +1352,13 @@ struct TORCH_API IValue final { DeviceIndex index; } as_device; } u; + static_assert(std::is_trivially_copyable_v); at::Tensor as_tensor; Payload() : u() {} + Payload(const Payload&) = delete; + Payload(Payload&&) = delete; + Payload& operator=(const Payload&) = delete; + Payload& operator=(Payload&&) = delete; ~Payload() {} }; diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.h b/torch/csrc/cuda/CUDAPluggableAllocator.h index e98909c4897ce..5ee7b6824d1b7 100644 --- a/torch/csrc/cuda/CUDAPluggableAllocator.h +++ b/torch/csrc/cuda/CUDAPluggableAllocator.h @@ -73,7 +73,11 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator std::function free_fn); CUDAPluggableAllocator(CUDAPluggableAllocator& other); - CUDAPluggableAllocator& operator=(CUDAPluggableAllocator& other) = delete; + CUDAPluggableAllocator(CUDAPluggableAllocator&& other) = delete; + CUDAPluggableAllocator& operator=(const CUDAPluggableAllocator& other) = + delete; + CUDAPluggableAllocator& operator=(CUDAPluggableAllocator&& other) = delete; + ~CUDAPluggableAllocator() override = default; void set_init_fn(std::function init_fn); diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h index abaa9a845082b..0b5bad4d2b495 100644 --- a/torch/csrc/profiler/collection.h +++ b/torch/csrc/profiler/collection.h @@ -57,6 +57,7 @@ struct TORCH_API RawTensorMetadata : RawTensorMetadataBase { RawTensorMetadata(RawTensorMetadata&&) noexcept = default; RawTensorMetadata& operator=(const RawTensorMetadata&) = default; RawTensorMetadata& operator=(RawTensorMetadata&&) noexcept = default; + ~RawTensorMetadata() = default; explicit RawTensorMetadata(const at::Tensor& t); // Wrap `weak_self_` in `std::optional` and split device into components to diff --git a/torch/csrc/profiler/perf.h b/torch/csrc/profiler/perf.h index 8257f86c7098c..07ff1211dbf91 100644 --- a/torch/csrc/profiler/perf.h +++ b/torch/csrc/profiler/perf.h @@ -37,6 +37,8 @@ class PerfEvent { public: explicit PerfEvent(std::string& name) : name_(name) {} + PerfEvent(const PerfEvent& other) = delete; + PerfEvent& operator=(const PerfEvent&) = delete; PerfEvent& operator=(PerfEvent&& other) noexcept { if (this != &other) { fd_ = other.fd_; diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.h b/torch/csrc/profiler/standalone/privateuse1_observer.h index 62b431aabc8b4..48b77d3daae28 100644 --- a/torch/csrc/profiler/standalone/privateuse1_observer.h +++ b/torch/csrc/profiler/standalone/privateuse1_observer.h @@ -12,6 +12,10 @@ struct PushPRIVATEUSE1CallbacksStub { PushPRIVATEUSE1CallbacksStub(const PushPRIVATEUSE1CallbacksStub&) = delete; PushPRIVATEUSE1CallbacksStub& operator=(const PushPRIVATEUSE1CallbacksStub&) = delete; + PushPRIVATEUSE1CallbacksStub(PushPRIVATEUSE1CallbacksStub&&) = default; 
+ PushPRIVATEUSE1CallbacksStub& operator=(PushPRIVATEUSE1CallbacksStub&&) = + default; + ~PushPRIVATEUSE1CallbacksStub() = default; template void operator()(ArgTypes&&... args) { diff --git a/torch/csrc/utils/object_ptr.h b/torch/csrc/utils/object_ptr.h index 81ad207306844..70a887860ffa7 100644 --- a/torch/csrc/utils/object_ptr.h +++ b/torch/csrc/utils/object_ptr.h @@ -10,6 +10,8 @@ class TORCH_PYTHON_API THPPointer { THPPointer() : ptr(nullptr){}; explicit THPPointer(T* ptr) noexcept : ptr(ptr){}; THPPointer(THPPointer&& p) noexcept : ptr(std::exchange(p.ptr, nullptr)) {} + THPPointer(const THPPointer& p) = delete; + THPPointer& operator=(const THPPointer&) = delete; ~THPPointer() { free(); diff --git a/torch/csrc/utils/throughput_benchmark.h b/torch/csrc/utils/throughput_benchmark.h index e10ca0649fd15..50854f1b73aa0 100644 --- a/torch/csrc/utils/throughput_benchmark.h +++ b/torch/csrc/utils/throughput_benchmark.h @@ -103,6 +103,7 @@ struct C10_HIDDEN ModuleInput { ModuleInput(const ModuleInput&) = delete; ModuleInput& operator=(ModuleInput& other) = delete; ModuleInput& operator=(ModuleInput&& other) = delete; + ~ModuleInput() = default; ModuleInput(py::args&& args, py::kwargs&& kwargs) : args(std::move(args)), kwargs(std::move(kwargs)) {} From c056dc4cb8fbe07902325df0eef0ede24a22b83f Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Sun, 27 Oct 2024 11:15:24 -0700 Subject: [PATCH 148/161] In Inductor, be willing to generate deferred runtime asserts when unbacked (#138804) Title + we avoid calling defer_assert when we statically know the guard results. timing for pnasnet5large ``` TIMING: code_gen:21.79672 inductor_compile:39.57726 backend_compile:65.30649 entire_frame_compile:95.22052 total_wall_time:95.22052 ``` matches with out the diff ``` TIMING: code_gen:21.89314 inductor_compile:39.72298 backend_compile:65.38539 entire_frame_compile:95.0854 total_wall_time:95.0854 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/138804 Approved by: https://github.com/ezyang --- .../pr_time_benchmarks/expected_results.csv | 2 +- torch/_inductor/sizevars.py | 27 +++++++++++++++---- torch/fx/experimental/symbolic_shapes.py | 7 +++-- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv index 1605327050975..5ff4702a7b0bb 100644 --- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv +++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv @@ -35,7 +35,7 @@ basic_modules_ListOfLinears_inductor_gpu, compile_time_instruct -update_hint_regression, compile_time_instruction_count, 1853008305, 0.02 +update_hint_regression, compile_time_instruction_count, 1795333141, 0.02 diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index 44fe34895a8cd..8775036cf1059 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -247,9 +247,11 @@ def _simplify_loops_impl( # for which "strides" don't make sense so we ignore them here. # NOTE: These expressions may still block merging dims in the sound # substitution test performed in can_merge_dims. 
- self.stride_vars(x, index_vars) - if isinstance(x, sympy.Expr) - else [0] * len(index_vars) + ( + self.stride_vars(x, index_vars) + if isinstance(x, sympy.Expr) + else [0] * len(index_vars) + ) for x in index_formulas ] assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0])) @@ -415,14 +417,29 @@ def guard_equals(self, left: Expr, right: Expr) -> Expr: left = sympy_subs(left, self.inv_precomputed_replacements) # type: ignore[arg-type] if isinstance(right, Expr): right = sympy_subs(right, self.inv_precomputed_replacements) # type: ignore[arg-type] - assert self.shape_env.evaluate_expr(sympy.Eq(left, right)) + + expr = sympy.Eq(left, right) + static_expr = self.shape_env._maybe_evaluate_static(expr) + + if static_expr is not None: + assert bool(static_expr) + return left + + assert self.shape_env.defer_runtime_assert(expr, "guard_equals") return left def guard_leq(self, left: Expr, right: Expr) -> None: return self.guard_lt(left, right + 1) def guard_lt(self, left: Expr, right: Expr) -> None: - assert self.shape_env.evaluate_expr(sympy.Lt(left, right)) + expr = sympy.Lt(left, right) + static_expr = self.shape_env._maybe_evaluate_static(expr) + + if static_expr is not None: + assert bool(static_expr) + return + + assert self.shape_env.defer_runtime_assert(expr, "guard_lt") def guarded_order(self, seq): """ diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index d5503ba25acb3..83c651e29c585 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -6289,6 +6289,7 @@ def cleanup(self) -> None: for ra in ras: ra.stack.cleanup() + @lru_cache(256) @record_shapeenv_event(save_tracked_fakes=True) def defer_runtime_assert( self, orig_expr: SympyBoolean, msg: str, fx_node: Optional[torch.fx.Node] = None @@ -6326,7 +6327,6 @@ def defer_runtime_assert( # NB: Don't use new_expr as expr; it could contain gunk like shape0 # which we don't want to guard on - # OK, we're definitely doing a runtime assert now if ( self._translation_validation_enabled and fx_node is not None @@ -6340,10 +6340,9 @@ def defer_runtime_assert( if not self._suppress_guards_tls(): # If you're here because of this assert, read Note [Backwards runtime asserts] # in torch/_inductor/graph.py - assert not self.runtime_asserts_frozen, expr - + if self.runtime_asserts_frozen: + log.warning("runtime_asserts_frozen but then got %s", expr) self._check_frozen(expr, sympy.true) - # eliminate symbols on equality tests / refine ranges if isinstance(expr, sympy.Rel): self._maybe_guard_rel(expr) From 5d450d7facd7480482132408acc4c23d80933bab Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 27 Oct 2024 13:51:00 -0400 Subject: [PATCH 149/161] Add sym_log2 (#137980) Internal xref: https://fb.workplace.com/groups/1075192433118967/permalink/1515595595745313/ Signed-off-by: Edward Z. 
Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/137980 Approved by: https://github.com/bobrenjc93 --- test/test_dynamic_shapes.py | 15 ++++- torch/__init__.py | 3 + torch/fx/experimental/sym_node.py | 91 ++++++++++++++++-------------- torch/utils/_sympy/functions.py | 3 + torch/utils/_sympy/interp.py | 5 ++ torch/utils/_sympy/reference.py | 13 +++++ torch/utils/_sympy/value_ranges.py | 8 +++ 7 files changed, 93 insertions(+), 45 deletions(-) diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 58f240e507544..0ec0d937d8dee 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -499,6 +499,16 @@ def test_sym_int(self): str(shape_env.guards[2][0]), """Eq(TruncToInt(2.0*ToFloat(s2)), 6)""" ) + def test_sym_log2(self): + shape_env = ShapeEnv() + a0 = create_symint(shape_env, 4) + r = torch._sym_log2(a0) + self.assertEqual(r, 2.0) + self.assertIsInstance(r, torch.SymFloat, msg=type(r)) + self.assertExpectedInline( + str(shape_env.guards[0][0]), """Eq(OpaqueUnaryFn_log2(ToFloat(s0)), 2.0)""" + ) + def test_sym_sqrt(self): shape_env = ShapeEnv() a0 = create_symint(shape_env, 4) @@ -506,7 +516,7 @@ def test_sym_sqrt(self): self.assertEqual(r, 2) self.assertIsInstance(r, torch.SymFloat, msg=type(r)) self.assertExpectedInline( - str(shape_env.guards[0][0]), """Eq(OpaqueUnaryFn_sqrt(s0), 2.0)""" + str(shape_env.guards[0][0]), """Eq(OpaqueUnaryFn_sqrt(ToFloat(s0)), 2.0)""" ) def test_sym_floor(self): @@ -540,7 +550,8 @@ def test_sym_trunc(self): self.assertEqual(r, 2) self.assertIsInstance(r, torch.SymInt, msg=type(r)) self.assertExpectedInline( - str(shape_env.guards[1][0]), """Eq(TruncToInt(OpaqueUnaryFn_sqrt(s0)), 2)""" + str(shape_env.guards[1][0]), + """Eq(TruncToInt(OpaqueUnaryFn_sqrt(ToFloat(s0))), 2)""", ) def test_sym_ceil(self): diff --git a/torch/__init__.py b/torch/__init__.py index 144af1f508eef..6c325d17c124c 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -876,6 +876,8 @@ def _get_sym_math_fn(name): def fn(a): if overrides.has_torch_function_unary(a): return overrides.handle_torch_function(fn, (a,), a) + if isinstance(a, SymInt): + a = torch.sym_float(a) if hasattr(a, f"__sym_{name}__"): return getattr(a, f"__sym_{name}__")() return getattr(math, name)(a) @@ -895,6 +897,7 @@ def fn(a): "asin", "acos", "atan", + "log2", ): __sym_name = f"_sym_{__name}" __fn = _get_sym_math_fn(__name) diff --git a/torch/fx/experimental/sym_node.py b/torch/fx/experimental/sym_node.py index c30cab7431c48..9c59973621bf9 100644 --- a/torch/fx/experimental/sym_node.py +++ b/torch/fx/experimental/sym_node.py @@ -1,4 +1,8 @@ # mypy: allow-untyped-defs + +from __future__ import annotations + + """ This file does three things: - Contains the definition of SymNode @@ -145,12 +149,12 @@ def compute_hint(): ) self.fx_node = tx_validation_en and fx_node - def with_shape_env(self, shape_env: "ShapeEnv") -> "SymNode": + def with_shape_env(self, shape_env: ShapeEnv) -> SymNode: return SymNode( self._expr, shape_env, self.pytype, self._hint, self.constant, self.fx_node ) - def _value_eq(self, other: "SymNode") -> bool: + def _value_eq(self, other: SymNode) -> bool: # Purposely don't include the shape_env in the eq. 
return ( self._expr == other._expr @@ -281,121 +285,121 @@ def _graph_repr(self) -> builtins.str: # These methods call the metaprogrammed methods, they're hand written # here so we get good stack traces - def abs(self) -> "SymNode": + def abs(self) -> SymNode: return self._abs() # type: ignore[attr-defined] - def pos(self) -> "SymNode": + def pos(self) -> SymNode: return self._pos() # type: ignore[attr-defined] - def round(self, ndigits=None) -> "SymNode": + def round(self, ndigits=None) -> SymNode: return self._round(ndigits) # type: ignore[attr-defined] - def trunc(self) -> "SymNode": + def trunc(self) -> SymNode: return self._trunc() # type: ignore[attr-defined] - def add(self, other) -> "SymNode": + def add(self, other) -> SymNode: return self._add(other) # type: ignore[attr-defined] - def sub(self, other) -> "SymNode": + def sub(self, other) -> SymNode: return self._sub(other) # type: ignore[attr-defined] - def mul(self, other) -> "SymNode": + def mul(self, other) -> SymNode: return self._mul(other) # type: ignore[attr-defined] - def mod(self, other) -> "SymNode": + def mod(self, other) -> SymNode: return self._mod(other) # type: ignore[attr-defined] - def float_pow(self, other) -> "SymNode": + def float_pow(self, other) -> SymNode: return self._float_pow(other) # type: ignore[attr-defined] - def pow_by_natural(self, other) -> "SymNode": + def pow_by_natural(self, other) -> SymNode: return self._pow_by_natural(other) # type: ignore[attr-defined] - def and_(self, other) -> "SymNode": + def and_(self, other) -> SymNode: return self._and_(other) # type: ignore[attr-defined] - def or_(self, other) -> "SymNode": + def or_(self, other) -> SymNode: return self._or_(other) # type: ignore[attr-defined] - def float_truediv(self, other) -> "SymNode": + def float_truediv(self, other) -> SymNode: return self._float_truediv(other) # type: ignore[attr-defined] - def int_truediv(self, other) -> "SymNode": + def int_truediv(self, other) -> SymNode: return self._int_truediv(other) # type: ignore[attr-defined] - def int_floordiv(self, other) -> "SymNode": + def int_floordiv(self, other) -> SymNode: return self._int_floordiv(other) # type: ignore[attr-defined] - def lshift(self, other) -> "SymNode": + def lshift(self, other) -> SymNode: return self._lshift(other) # type: ignore[attr-defined] - def rshift(self, other) -> "SymNode": + def rshift(self, other) -> SymNode: return self._rshift(other) # type: ignore[attr-defined] - def sym_not(self) -> "SymNode": # noqa: F811 + def sym_not(self) -> SymNode: # noqa: F811 return self._sym_not() # type: ignore[attr-defined] - def eq(self, other) -> "SymNode": + def eq(self, other) -> SymNode: return self._eq(other) # type: ignore[attr-defined] - def ne(self, other) -> "SymNode": + def ne(self, other) -> SymNode: return self._ne(other) # type: ignore[attr-defined] - def gt(self, other) -> "SymNode": + def gt(self, other) -> SymNode: return self._gt(other) # type: ignore[attr-defined] - def lt(self, other) -> "SymNode": + def lt(self, other) -> SymNode: return self._lt(other) # type: ignore[attr-defined] - def le(self, other) -> "SymNode": + def le(self, other) -> SymNode: return self._le(other) # type: ignore[attr-defined] - def ge(self, other) -> "SymNode": + def ge(self, other) -> SymNode: return self._ge(other) # type: ignore[attr-defined] - def floor(self) -> "SymNode": + def floor(self) -> SymNode: return self._floor() # type: ignore[attr-defined] - def is_integer(self) -> "SymNode": + def is_integer(self) -> SymNode: return self._is_integer() # type: 
ignore[attr-defined] - def sym_float(self) -> "SymNode": # noqa: F811 + def sym_float(self) -> SymNode: # noqa: F811 return self._sym_float() # type: ignore[attr-defined] - def sym_int(self) -> "SymNode": + def sym_int(self) -> SymNode: return self._sym_int() # type: ignore[attr-defined] - def ceil(self) -> "SymNode": + def ceil(self) -> SymNode: return self._ceil() # type: ignore[attr-defined] - def neg(self) -> "SymNode": + def neg(self) -> SymNode: return self._neg() # type: ignore[attr-defined] - def sym_min(self, other) -> "SymNode": # noqa: F811 + def sym_min(self, other) -> SymNode: # noqa: F811 return self._sym_min(other) # type: ignore[attr-defined] - def sym_max(self, other) -> "SymNode": # noqa: F811 + def sym_max(self, other) -> SymNode: # noqa: F811 return self._sym_max(other) # type: ignore[attr-defined] - def sym_ite(self, then_val, else_val) -> "SymNode": + def sym_ite(self, then_val, else_val) -> SymNode: return self._sym_ite(then_val, else_val) # type: ignore[attr-defined] - def is_contiguous(self, sizes, strides) -> "SymNode": + def is_contiguous(self, sizes, strides) -> SymNode: return self._is_contiguous(sizes, strides) # type: ignore[attr-defined] - def is_channels_last_contiguous_2d(self, sizes, strides) -> "SymNode": + def is_channels_last_contiguous_2d(self, sizes, strides) -> SymNode: return self._is_channels_last_contiguous_2d(sizes, strides) # type: ignore[attr-defined] - def is_channels_last_contiguous_3d(self, sizes, strides) -> "SymNode": + def is_channels_last_contiguous_3d(self, sizes, strides) -> SymNode: return self._is_channels_last_contiguous_3d(sizes, strides) # type: ignore[attr-defined] - def is_channels_last_strides_2d(self, sizes, strides) -> "SymNode": + def is_channels_last_strides_2d(self, sizes, strides) -> SymNode: return self._is_channels_last_strides_2d(sizes, strides) # type: ignore[attr-defined] - def is_channels_last_strides_3d(self, sizes, strides) -> "SymNode": + def is_channels_last_strides_3d(self, sizes, strides) -> SymNode: return self._is_channels_last_strides_3d(sizes, strides) # type: ignore[attr-defined] - def is_non_overlapping_and_dense_indicator(self, sizes, strides) -> "SymNode": + def is_non_overlapping_and_dense_indicator(self, sizes, strides) -> SymNode: return self._is_non_overlapping_and_dense_indicator(sizes, strides) # type: ignore[attr-defined] # Make C++ happy @@ -409,7 +413,7 @@ def sym_and(self, other): def truediv(self, other): return self.float_truediv(other) - def floordiv(self, other) -> "SymNode": + def floordiv(self, other) -> SymNode: return self.int_floordiv(other) # We didn't bind integer pow in C++ @@ -629,6 +633,7 @@ def fn(self): "asin", "acos", "atan", + "log2", ) for name in math_op_names: sym_name = f"sym_{name}" @@ -656,7 +661,7 @@ def fn(self): bool_magic_methods = only_bool_magic_methods | also_bool_magic_methods # Methods that are only for float -only_float_magic_methods = {"is_integer", "round", "sym_int"} +only_float_magic_methods = {"is_integer", "round", "sym_int", "sym_log2"} magic_methods_on_operator_with_trailing_underscore = {"and", "or"} diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py index e3e248a008328..b369673d9213e 100644 --- a/torch/utils/_sympy/functions.py +++ b/torch/utils/_sympy/functions.py @@ -1197,6 +1197,8 @@ def eval(cls, a): a = sympy.oo if a is -int_oo: a = -sympy.oo + if name == "log2": + return sympy.log(a, 2) return getattr(sympy, name)(a) return None @@ -1219,3 +1221,4 @@ def eval(cls, a): OpaqueUnaryFn_exp = make_opaque_unary_fn("exp") 
OpaqueUnaryFn_log = make_opaque_unary_fn("log") OpaqueUnaryFn_asinh = make_opaque_unary_fn("asinh") +OpaqueUnaryFn_log2 = make_opaque_unary_fn("log2") diff --git a/torch/utils/_sympy/interp.py b/torch/utils/_sympy/interp.py index eb03e0697cda2..3d26fa861b1e8 100644 --- a/torch/utils/_sympy/interp.py +++ b/torch/utils/_sympy/interp.py @@ -31,6 +31,7 @@ Min, Mod, ModularIndexing, + OpaqueUnaryFn_log2, PowByNatural, PythonMod, RoundDecimal, @@ -101,7 +102,11 @@ def handlers(): Identity: "identity", IsNonOverlappingAndDenseIndicator: "is_non_overlapping_and_dense_indicator", RoundDecimal: "round_decimal", + # TODO: do the rest of the opaque unary functions... + OpaqueUnaryFn_log2: "log2", } + # TODO: This is kind of pointless, we shouldn't be generating sympy.sin + # for these functions, they should be Opaque instead for name in ["cos", "sin", "tan", "sinh", "cosh", "tanh", "asin", "acos", "atan"]: HANDLERS[getattr(sympy, name)] = name diff --git a/torch/utils/_sympy/reference.py b/torch/utils/_sympy/reference.py index 4e2835fceaf30..3798fe3ea1354 100644 --- a/torch/utils/_sympy/reference.py +++ b/torch/utils/_sympy/reference.py @@ -17,6 +17,7 @@ Mod, OpaqueUnaryFn_exp, OpaqueUnaryFn_log, + OpaqueUnaryFn_log2, OpaqueUnaryFn_sqrt, PowByNatural, RoundDecimal, @@ -162,6 +163,10 @@ def exp(x): def log(x): return OpaqueUnaryFn_log(x) + @staticmethod + def log2(x): + return OpaqueUnaryFn_log2(x) + @staticmethod def sqrt(x): return OpaqueUnaryFn_sqrt(x) @@ -247,6 +252,10 @@ def exp(x): def log(x): raise AssertionError("log is not valid shape sympy expr") + @staticmethod + def log2(x): + return torch._sym_log2(x) # type: ignore[attr-defined] + @staticmethod def sqrt(x): return torch._sym_sqrt(x) # type: ignore[attr-defined] @@ -472,6 +481,10 @@ def exp(x): def log(x): return torch.ops.aten.log.default(x) + @staticmethod + def log2(x): + return torch.ops.aten.log2.default(x) + @staticmethod def sqrt(x): return torch.ops.aten.sqrt.default(x) diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py index c9dad120dc935..38cb27ebd40b0 100644 --- a/torch/utils/_sympy/value_ranges.py +++ b/torch/utils/_sympy/value_ranges.py @@ -34,6 +34,7 @@ IntTrueDiv, OpaqueUnaryFn_exp, OpaqueUnaryFn_log, + OpaqueUnaryFn_log2, OpaqueUnaryFn_sqrt, PowByNatural, RoundDecimal, @@ -760,6 +761,13 @@ def log(x): return ValueRanges.unknown() return ValueRanges.increasing_map(x, OpaqueUnaryFn_log) + @staticmethod + def log2(x): + x = ValueRanges.wrap(x) + if x.lower <= 0: + return ValueRanges.unknown() + return ValueRanges.increasing_map(x, OpaqueUnaryFn_log2) + @classmethod def minimum(cls, a, b): return cls.min_or_max(a, b, sympy.Min) From f7dc13806e56ebb14f24c558aba7d5379f2b27e1 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 28 Oct 2024 03:35:57 +0000 Subject: [PATCH 150/161] [2/N] Don't skip ASAN on some tests (#138663) Follows #138571 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138663 Approved by: https://github.com/ezyang --- test/test_dataloader.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 8556ef328eb4c..2c2ddb85203c0 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3132,10 +3132,6 @@ def __getitem__(self, idx): "Fails with TSAN with the following error: starting new threads after multi-threaded " "fork is not supported. 
Dying (set die_after_fork=0 to override)", ) -@unittest.skipIf( - TEST_WITH_ASAN, - "DataLoader tests hang in ASAN, see: https://github.com/pytorch/pytorch/issues/66223", -) class TestDataLoaderPersistentWorkers(TestDataLoader): def setUp(self): super().setUp() @@ -3407,10 +3403,6 @@ def __len__(self): "Fails with TSAN with the following error: starting new threads after multi-threaded " "fork is not supported. Dying (set die_after_fork=0 to override)", ) -@unittest.skipIf( - TEST_WITH_ASAN, - "Flaky with ASAN, see https://github.com/pytorch/pytorch/issues/65727", -) class TestIndividualWorkerQueue(TestCase): def setUp(self): super().setUp() @@ -3503,10 +3495,6 @@ def __getitem__(self, index): @unittest.skipIf(IS_WINDOWS, "Needs fork") -@unittest.skipIf( - TEST_WITH_ASAN, - "This test hangs when running with ASAN, see https://github.com/pytorch/pytorch/issues/75492", -) class TestConvAfterFork(TestCase): # Tests crash reported in https://github.com/pytorch/pytorch/issues/53565 def test_conv_after_fork(self): From d72241d0452811696e6c086c3a2e9b9204749dac Mon Sep 17 00:00:00 2001 From: Aaron Gokaslan Date: Mon, 28 Oct 2024 03:36:42 +0000 Subject: [PATCH 151/161] [Ez][BE]: Fix one more incorrect TypeIs (#139010) One other case where the side conditions could cause inaccurate typing info. Follow up to #138990 Pull Request resolved: https://github.com/pytorch/pytorch/pull/139010 Approved by: https://github.com/malfet --- torch/_subclasses/fake_tensor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 3a7677320c7c3..df2fef74e127e 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -32,7 +32,7 @@ TypeVar, Union, ) -from typing_extensions import Self, TypeGuard, TypeIs +from typing_extensions import Self, TypeGuard from weakref import ReferenceType import torch @@ -170,7 +170,7 @@ def get_plain_tensors( return plain_tensors -def is_fake(x: object) -> TypeIs[Tensor]: +def is_fake(x: object) -> TypeGuard[Tensor]: if isinstance(x, FakeTensor): return True if is_traceable_wrapper_subclass(x): From 6f5d538972b029774eb16cc16f11772b8f3b7521 Mon Sep 17 00:00:00 2001 From: PyTorch UpdateBot Date: Mon, 28 Oct 2024 03:43:56 +0000 Subject: [PATCH 152/161] [executorch hash update] update the pinned executorch hash (#138661) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned executorch hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138661 Approved by: https://github.com/pytorchbot --- .ci/docker/ci_commit_pins/executorch.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 9aaea8851d475..04f5876917a55 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -export-D64151426 +16b633b4daa7f3d3442be62a3589bd60b2f7fdc7 From 1fad37a02344cc4fd4f84c34674c11b39ba0253b Mon Sep 17 00:00:00 2001 From: PyTorch UpdateBot Date: Mon, 28 Oct 2024 04:04:25 +0000 Subject: [PATCH 153/161] [audio hash update] update the pinned audio hash (#138402) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138402 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/audio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 3789810cfb5ab..19c6feee63c74 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -79047bf6bdec9e32c4cffd0f9835b347781fefbf +fa44bdab1fe49bab58389e7b6a33061ffced9bc7 From 4c6ae39afd179e4caff53679a35610771bfca433 Mon Sep 17 00:00:00 2001 From: Bob Ren Date: Sun, 27 Oct 2024 14:58:16 -0700 Subject: [PATCH 154/161] Fix some nits in symbolic_shapes.py (#139018) While I was reading through this file for understanding, I fixed some nits. Pull Request resolved: https://github.com/pytorch/pytorch/pull/139018 Approved by: https://github.com/ezyang --- torch/fx/experimental/symbolic_shapes.py | 33 ++++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index 83c651e29c585..d7f8eb555cc5d 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -305,13 +305,6 @@ def uninteresting_files() -> Set[str]: return {inspect.getfile(m) for m in mods} -# We don't bother with the metaclass as all of the dispatching logic happens -# entirely from Python -# -# Didn't bother with ancestors for now, unlikely to have multiple modes for -# symints right now - - class ConstraintViolationError(RuntimeError): pass @@ -352,7 +345,8 @@ def has_hint(a: Scalar) -> bool: def is_concrete_int(a: Union[int, SymInt]) -> bool: - r"""Utility to check if underlying object + """ + Utility to check if underlying object in SymInt is concrete value. Also returns true if integer is passed in. @@ -525,7 +519,8 @@ def is_accessor_node(node: torch.fx.Node) -> bool: def canonicalize_bool_expr(expr: _T) -> _T: - r"""Canonicalize a boolean expression by transforming it into a lt / le + """ + Canonicalize a boolean expression by transforming it into a lt / le inequality and moving all the non-constant terms to the rhs. We canonicalize And / Ors / Not via cnf and then canonicalize their subexpr recursively @@ -677,9 +672,11 @@ def div_by_factor(x: sympy.Expr, factor: int) -> sympy.Expr: def is_concrete_bool(a: Union[bool, SymBool]) -> bool: - r"""Utility to check if underlying object + """ + Utility to check if underlying object in SymBool is concrete value. Also returns true if integer is passed in. + Args: a (SymBool or bool): Object to test if it bool """ @@ -1059,7 +1056,8 @@ def definitely_false(a: BoolLikeType) -> bool: def statically_known_true(x: Union[bool, SymBool]) -> bool: - """Returns True if x can be simplified to a constant and is true. + """ + Returns True if x can be simplified to a constant and is true. .. note:: This function doesn't introduce new guards, so the expression may end @@ -1067,7 +1065,6 @@ def statically_known_true(x: Union[bool, SymBool]) -> bool: Args: x (bool, SymBool): The expression to try statically evaluating - """ if isinstance(x, SymBool): expr = x.node.expr @@ -1481,7 +1478,8 @@ class EqualityConstraint(Constraint): _defs: Dict[Source, sympy.Expr] = field(init=False) def __post_init__(self) -> None: - """Pre-processing to answer queries `is_equal` and `is_derived` below. + """ + Pre-processing to answer queries `is_equal` and `is_derived` below. 
Example: Suppose we are given: source_pairs [a = b, b = c] @@ -3089,9 +3087,9 @@ def _init( # Whenever we allocate a fresh unbacked Symbol, we add it to this # pending list. Unbacked symbol allocation can occur at unpredictable - # points during meta tensor propagation, but at some point, the we + # points during meta tensor propagation, but at some point, we # have to know what the binding site for an unbacked symbol is, and - # this is computed when we actually place the node in the graph. The + # this is computed when we actually place the node in the graph. The # important thing is that we always actually handle every unaccounted # for unbacked symbol, so this list helps us keep track of them and # then make sure they are all accounted for. @@ -4051,7 +4049,8 @@ def create_unspecified_symbol( dynamic_dim: DimDynamic = DimDynamic.DUCK, constraint_dim: DimConstraint = None, # NB: includes None ) -> sympy.Expr: - """Create a symbol with an unspecified value + """ + Create a symbol with an unspecified value Compared to standard symbols we do not assume the value is positive, nor do we specialze on zero or one values. @@ -4471,7 +4470,7 @@ def _create_no_constraints_context(t: Tensor) -> StatelessSymbolicContext: # # So, it is perhaps easier to flip things on their head: the guard # expressions we generate here say what simplifications are valid, - # and what are not. Below, we explain each of the guard expressions + # and what are not. Below, we explain each of the guard expressions # we generate # TODO: Make this more efficient by binding all the size/stride/offsets From d2052ea84d8e8d52c1c8c32269851bcd11ad2aed Mon Sep 17 00:00:00 2001 From: Charles Coulombe Date: Mon, 28 Oct 2024 04:30:46 +0000 Subject: [PATCH 155/161] Update test_multiarray.py to support numpy 2.0+ (#138461) Import _core instead of core. 
Addresses partially #137182 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138461 Approved by: https://github.com/ezyang, https://github.com/albanD --- test/torch_np/numpy_tests/core/test_multiarray.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/torch_np/numpy_tests/core/test_multiarray.py b/test/torch_np/numpy_tests/core/test_multiarray.py index f0ef1f79beb54..888a4c53db57d 100644 --- a/test/torch_np/numpy_tests/core/test_multiarray.py +++ b/test/torch_np/numpy_tests/core/test_multiarray.py @@ -75,7 +75,12 @@ IS_PYSTON = False HAS_REFCOUNT = True -from numpy.core.tests._locales import CommaDecimalPointLocale +if numpy.__version__ > "2": + # numpy 2.0 +, see https://numpy.org/doc/stable/release/2.0.0-notes.html#renamed-numpy-core-to-numpy-core + from numpy._core.tests._locales import CommaDecimalPointLocale +else: + from numpy.core.tests._locales import CommaDecimalPointLocale + from numpy.testing._private.utils import _no_tracing, requires_memory From 39aa3cb8d64d01c4fa306290a0f2b37af85aa5f4 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 28 Oct 2024 05:21:31 +0000 Subject: [PATCH 156/161] Re-enable skipped ubsan tests (#139008) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/139008 Approved by: https://github.com/ezyang --- test/test_jit.py | 47 ++++------------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c3af8bc9f48fc..228b9ffc17c76 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -95,9 +95,9 @@ # Testing utils from torch.testing._internal import jit_utils from torch.testing._internal.common_jit import check_against_reference -from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ - suppress_warnings, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \ - freeze_rng_state, slowTest, TemporaryFileName, \ +from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, \ + suppress_warnings, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, \ + TestCase, freeze_rng_state, slowTest, TemporaryFileName, \ enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \ skipIfCrossRef, skipIfTorchDynamo from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ @@ -16008,44 +16008,6 @@ class TestJitGeneratedModule(JitTestCase): class TestJitGeneratedFunctional(JitTestCase): pass -# UBSAN per-function exclusions don't seem to work with OpenMP pragmas, -# and we have to disable the failing tests here instead. 
-UBSAN_DISABLED_TESTS = [ - "test___rdiv___constant", - "test___rdiv___scalar_constant", - "test_addcdiv", - "test_addcdiv_broadcast_all", - "test_addcdiv_broadcast_rhs", - "test_addcdiv_scalar", - "test_addcdiv_scalar_broadcast_lhs", - "test_addcdiv_scalar_broadcast_rhs", - "test_addcdiv_scalar_scale", - "test_addcdiv_scalar_scale_broadcast_lhs", - "test_addcdiv_scalar_scale_broadcast_rhs", - "test_addcdiv_scale", - "test_addcdiv_scale_broadcast_all", - "test_addcdiv_scale_broadcast_rhs", - "test_add_broadcast_all", - "test_add_broadcast_lhs", - "test_add_broadcast_rhs", - "test_add_constant", - "test_add_scalar", - "test_add_scalar_broadcast_lhs", - "test_add_scalar_broadcast_rhs", - "test_div", - "test_div_broadcast_all", - "test_div_broadcast_lhs", - "test_div_broadcast_rhs", - "test_div_scalar", - "test_div_scalar_broadcast_lhs", - "test_div_scalar_broadcast_rhs", - "test_rsqrt", - "test_rsqrt_scalar", - "test_add", - "test_reciprocal", - "test_reciprocal_scalar", -] - L = 20 M = 10 S = 5 @@ -16183,8 +16145,7 @@ def post_add_test(test_name, skipTestIf, do_test, test_class): for skip in skipTestIf: do_test = skip(do_test) - if not (TEST_WITH_UBSAN and test_name in UBSAN_DISABLED_TESTS): - setattr(test_class, test_name, do_test) + setattr(test_class, test_name, do_test) def normalize_check_ad(check_ad, name): From f9ae3fac8c129838554751eb102d1fc933406370 Mon Sep 17 00:00:00 2001 From: cyy Date: Mon, 28 Oct 2024 05:29:23 +0000 Subject: [PATCH 157/161] [Distributed] [19/N] Fix clang-tidy warnings in torch/csrc/distributed/ (#138903) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/138903 Approved by: https://github.com/ezyang --- torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp | 10 ++++------ torch/csrc/distributed/c10d/DMAConnectivity.hpp | 4 +--- torch/csrc/distributed/c10d/GroupRegistry.cpp | 1 + torch/csrc/distributed/c10d/NCCLUtils.hpp | 2 +- torch/csrc/distributed/c10d/ProcessGroupGloo.hpp | 4 +++- torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 12 ++++++++---- torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp | 9 +++++---- torch/csrc/distributed/c10d/PyProcessGroup.hpp | 1 + torch/csrc/distributed/c10d/RankLocal.hpp | 2 +- torch/csrc/distributed/c10d/Store.hpp | 1 + torch/csrc/distributed/c10d/SymmetricMemory.hpp | 10 ++++------ torch/csrc/distributed/c10d/init.cpp | 7 +++++-- torch/csrc/distributed/c10d/intra_node_comm.cpp | 8 ++++++-- .../distributed/c10d/quantization/quantization.h | 1 - .../distributed/c10d/quantization/quantization_gpu.h | 1 - torch/csrc/distributed/c10d/sequence_num.hpp | 4 ++-- 16 files changed, 43 insertions(+), 34 deletions(-) diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp index 4fa907a952881..6efdcf6277678 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp +++ b/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp @@ -4,8 +4,7 @@ #include #include -namespace c10d { -namespace symmetric_memory { +namespace c10d::symmetric_memory { #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) using HandleType = CUmemGenericAllocationHandle; @@ -85,13 +84,13 @@ struct Block : public c10::intrusive_ptr_target { size_t block_size, size_t buffer_size, size_t signal_pad_offset, - const std::string& group_name) + std::string group_name) : handle(handle), device_idx(device_idx), block_size(block_size), buffer_size(buffer_size), signal_pad_offset(signal_pad_offset), - group_name(group_name), + group_name(std::move(group_name)), 
symm_mem(nullptr) {} }; @@ -113,5 +112,4 @@ class CUDASymmetricMemoryAllocator : public SymmetricMemoryAllocator { std::unordered_map> ptr_to_block_; }; -} // namespace symmetric_memory -} // namespace c10d +} // namespace c10d::symmetric_memory diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.hpp b/torch/csrc/distributed/c10d/DMAConnectivity.hpp index cede6aa265c77..db6baa3969ef6 100644 --- a/torch/csrc/distributed/c10d/DMAConnectivity.hpp +++ b/torch/csrc/distributed/c10d/DMAConnectivity.hpp @@ -1,7 +1,5 @@ #pragma once -#include - #include namespace c10d { @@ -25,7 +23,7 @@ struct TORCH_API DMAConnectivity : c10::intrusive_ptr_target { struct DMAConnectivityDetector : c10::intrusive_ptr_target { virtual c10::intrusive_ptr detect() = 0; - virtual ~DMAConnectivityDetector() {} + ~DMAConnectivityDetector() override = default; }; C10_EXPORT void register_dma_connectivity_detector( diff --git a/torch/csrc/distributed/c10d/GroupRegistry.cpp b/torch/csrc/distributed/c10d/GroupRegistry.cpp index 2a735a4c99592..c56c91ef6ec3c 100644 --- a/torch/csrc/distributed/c10d/GroupRegistry.cpp +++ b/torch/csrc/distributed/c10d/GroupRegistry.cpp @@ -11,6 +11,7 @@ class GroupRegistry { public: void register_group( std::string group_name, + // NOLINTNEXTLINE(performance-unnecessary-value-param) c10::intrusive_ptr group) { std::unique_lock write_lock(lock_); auto [_, inserted] = diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp index 0089d453bb85a..32361e17580f5 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.hpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp @@ -370,7 +370,7 @@ class NCCLComm { NCCLComm& operator=(NCCLComm&& other) = delete; // Move constructable - // NOLINTNEXTLINE(.*-noexcept-move-.*) + // NOLINTNEXTLINE(*-noexcept-move-*) NCCLComm(NCCLComm&& other) { // Using other's lock, as it reads other's states // Can not use this.mutex_, as this object is being constructed. diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp index 1ebead6b598e7..111cf14bb0809 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,8 @@ class TORCH_API ProcessGroupGloo : public Backend { // Wrap c10d store as Gloo store class TORCH_API GlooStore : public ::gloo::rendezvous::Store { public: - GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {} + GlooStore(c10::intrusive_ptr<::c10d::Store> store) + : store_(std::move(store)) {} void setUint(const std::string& key, const std::vector& value) { store_->set(key, value); diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index c9564a31f057c..c98e76b1fd7a5 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -87,6 +87,7 @@ ncclDataType_t getNcclDataType(at::ScalarType type) { bool complexViewAsRealAllowed(const ReduceOp& reduceOp) { switch (reduceOp) { + // NOLINTNEXTLINE(bugprone-branch-clone) case ReduceOp::SUM: return true; case ReduceOp::AVG: @@ -119,6 +120,7 @@ ncclRedOpRAII unpackPreMulSum( &preMulSum, // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html#ncclredopcreatepremulsum // tells us that the scalar input is strictly a multiplier. + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) /*scalar=*/has_tensor ? 
const_cast(ptr_factor) : &scalar_factor, dataType, residence, @@ -318,6 +320,7 @@ static void cacheAllocatorRegisterHook( auto& ncclComm = it.first; auto& devIdx = it.second; if (te.device_ == devIdx) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) ncclComm->registerSegment(reinterpret_cast(te.addr_), te.size_); } } @@ -336,6 +339,7 @@ static void cacheAllocatorDeregisterHook( auto& ncclComm = it.first; auto& devIdx = it.second; if (te.device_ == devIdx) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) ncclComm->deregisterSegment(reinterpret_cast(te.addr_)); } } @@ -869,7 +873,6 @@ ProcessGroupNCCL::ProcessGroupNCCL( : Backend(rank, size), store_(std::move(store)), options_(std::move(options)), - traceKeyStart_(getTraceStartKey("NCCL", rank)), traceKeyEnd_(getTraceEndKey("NCCL", rank)), terminateProcessGroup_(false), @@ -888,7 +891,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( // other threads and cause segfaults. const auto ncclVersion = getNcclVersion(); this->setGroupUid(options_->group_name); - this->localDeviceCount_ = at::cuda::getNumGPUs(); + this->localDeviceCount_ = static_cast(at::cuda::getNumGPUs()); logPrefix_ = createLogPrefix(); blockingWait_ = getCvarBool(TORCH_NCCL_BLOCKING_WAIT, false); asyncErrorHandling_ = static_cast( @@ -1013,8 +1016,8 @@ ProcessGroupNCCL::ProcessGroupNCCL( this->globalRankStride = 0; } else { bool ranksAreStrided = true; - int startRank = options_->global_ranks_in_group[0]; - int stride = + auto startRank = options_->global_ranks_in_group[0]; + auto stride = options_->global_ranks_in_group[1] - options_->global_ranks_in_group[0]; for (std::vector::size_type i = 0; i < options_->global_ranks_in_group.size(); @@ -1377,6 +1380,7 @@ void ProcessGroupNCCL::shutdown() { this->abort(); } +// NOLINTNEXTLINE(bugprone-exception-escape) ProcessGroupNCCL::~ProcessGroupNCCL() { LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered."; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 839463a9d8be1..9ec169d13863d 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -198,7 +198,8 @@ struct DumpPipe { if (fd_ == -1) { return false; } - char buf[128]; + // NOLINTNEXTLINE(*array*) + char buf[128]{}; // non-blocking from O_NONBLOCK above. // Ignore EINTR because we already will poll this // again later. @@ -461,8 +462,8 @@ class TORCH_API ProcessGroupNCCL : public Backend { // NOTE: We intentionally store raw pointers so that // we do not attempt to destroy the event objects on process exit, // because cuda may be gone. - std::deque - eventsArray_[2]; // 0 for timing=false, 1 for timing=true + std::array, 2> + eventsArray_; // 0 for timing=false, 1 for timing=true }; struct Options : Backend::Options { @@ -1181,7 +1182,7 @@ class TORCH_API ProcessGroupNCCL : public Backend { bool dumpOnTimeoutOrEx_; // Whether or not to sleep after an exception is thrown in the watchdog. - bool sleepAfterException_; + bool sleepAfterException_{}; // Whether or not to enable nan check for input tensors to collectives. 
bool enableNanCheck_; diff --git a/torch/csrc/distributed/c10d/PyProcessGroup.hpp b/torch/csrc/distributed/c10d/PyProcessGroup.hpp index 3655984d452a9..81021bdf2c9ae 100644 --- a/torch/csrc/distributed/c10d/PyProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/PyProcessGroup.hpp @@ -212,6 +212,7 @@ class TORCH_PYTHON_API PythonOnCompletionHook { PythonOnCompletionHook(py::object hook) : hook_(std::move(hook)) {} PythonOnCompletionHook(const PythonOnCompletionHook&) = default; + // NOLINTNEXTLINE(bugprone-exception-escape) ~PythonOnCompletionHook() { py::gil_scoped_acquire ag; hook_.dec_ref(); diff --git a/torch/csrc/distributed/c10d/RankLocal.hpp b/torch/csrc/distributed/c10d/RankLocal.hpp index b3a649659af4c..33f074746d287 100644 --- a/torch/csrc/distributed/c10d/RankLocal.hpp +++ b/torch/csrc/distributed/c10d/RankLocal.hpp @@ -55,7 +55,7 @@ class RankLocal { } private: - RankLocal(){}; + RankLocal() = default; thread_local static T* cached_; static std::unordered_map thread_id_to_rank_local_; static std::shared_mutex lock_; diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp index d18de830ff7f3..0b6dfe48d0d0c 100644 --- a/torch/csrc/distributed/c10d/Store.hpp +++ b/torch/csrc/distributed/c10d/Store.hpp @@ -75,6 +75,7 @@ class TORCH_API Store : public torch::CustomClassHolder { // watchKey() is deprecated and no longer supported. virtual void watchKey( const std::string& /* unused */, + // NOLINTNEXTLINE(performance-unnecessary-value-param) WatchKeyCallback /* unused */) { TORCH_CHECK(false, "watchKey is deprecated, no implementation support it."); } diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.hpp b/torch/csrc/distributed/c10d/SymmetricMemory.hpp index 55b212ef90154..2eaecfce2c8cf 100644 --- a/torch/csrc/distributed/c10d/SymmetricMemory.hpp +++ b/torch/csrc/distributed/c10d/SymmetricMemory.hpp @@ -3,8 +3,7 @@ #include #include -namespace c10d { -namespace symmetric_memory { +namespace c10d::symmetric_memory { // SymmetricMemory represents symmetric allocations across a group of devices. // The allocations represented by a SymmetricMemory object are accessible by @@ -38,7 +37,7 @@ namespace symmetric_memory { // for these two barriers, they can operate correctly in parallel. 
class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target { public: - virtual ~SymmetricMemory() {} + ~SymmetricMemory() override = default; virtual std::vector get_buffer_ptrs() = 0; virtual std::vector get_signal_pad_ptrs() = 0; @@ -72,7 +71,7 @@ class TORCH_API SymmetricMemory : public c10::intrusive_ptr_target { class SymmetricMemoryAllocator : public c10::intrusive_ptr_target { public: - virtual ~SymmetricMemoryAllocator(){}; + ~SymmetricMemoryAllocator() override = default; virtual void* alloc( size_t size, @@ -159,5 +158,4 @@ TORCH_API c10::intrusive_ptr get_symmetric_memory( TORCH_API bool has_multicast_support( c10::DeviceType device_type, int device_idx); -} // namespace symmetric_memory -} // namespace c10d +} // namespace c10d::symmetric_memory diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b1cebfe0502be..2c3f836a394f7 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -109,6 +109,7 @@ class IntrusivePtrNoGilDestructor { explicit IntrusivePtrNoGilDestructor(T* impl) // NOLINTNEXTLINE(bugprone-exception-escape) : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} + // NOLINTNEXTLINE(bugprone-exception-escape) ~IntrusivePtrNoGilDestructor() { if (impl_) { if (PyGILState_Check()) { @@ -338,6 +339,7 @@ class PythonRequest : public ::c10d::control_plane::Request { }; class PythonResponse : public ::c10d::control_plane::Response { public: + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) void setContent(std::string&& content, const std::string& content_type) override { PYBIND11_OVERRIDE_PURE_NAME( @@ -1610,7 +1612,7 @@ that adds a prefix to each key inserted to the store. if (!store) { throw py::value_error("store argument cannot be None"); } - return new ::c10d::PrefixStore(prefix, store); + return new ::c10d::PrefixStore(prefix, std::move(store)); }), py::arg("prefix"), py::arg("store")) @@ -2755,7 +2757,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`). .def( "_verify_work_timeout", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self, - const c10::intrusive_ptr<::c10d::Work> work, + const c10::intrusive_ptr<::c10d::Work>& work, const std::chrono::milliseconds& timeout) { return self->verifyWorkTimeoutForTest(work, timeout); }, @@ -3415,6 +3417,7 @@ static PyMethodDef methods[] = { // NOLINT {"_c10d_init", c10d_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; +// NOLINTNEXTLINE(misc-use-internal-linkage) PyMethodDef* python_functions() { return methods; } diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/intra_node_comm.cpp index 6a61f16e9aea5..c0c53d220d86d 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/intra_node_comm.cpp @@ -7,6 +7,7 @@ namespace c10d::intra_node_comm { +// NOLINTNEXTLINE(misc-use-internal-linkage) bool isIntraNodeCommSupported(); static std::vector ENABLE_INTRA_NODE_COMM = { @@ -86,7 +87,7 @@ bool IntraNodeComm::isEnabled() { * Use c10d::Store to perform allgather on a trivially copyable type. 
*/ template -std::vector storeAllGather( +static std::vector storeAllGather( const c10::intrusive_ptr& store, const std::string& prefix, size_t rank, @@ -134,10 +135,12 @@ bool IntraNodeComm::rendezvous() { return false; } + // NOLINTNEXTLINE(bugprone-signed-char-misuse) deviceIdx_ = at::cuda::current_device(); // Exchange hostname and device bus ID struct DevInfo { + // NOLINTNEXTLINE char hostname[HOST_NAME_MAX + 1]; int deviceIdx; }; @@ -170,7 +173,8 @@ bool IntraNodeComm::rendezvous() { } auto groupName = "IntraNodeComm" + std::to_string(intraNodeCommIdx++); - set_group_info(groupName, rank_, worldSize_, store_); + set_group_info( + groupName, static_cast(rank_), static_cast(worldSize_), store_); auto allocator = get_allocator(c10::DeviceType::CUDA); symmetricMemoryPtr_ = allocator->alloc(bufferSize_, deviceIdx_, groupName); symmetricMemory_ = allocator->rendezvous(symmetricMemoryPtr_); diff --git a/torch/csrc/distributed/c10d/quantization/quantization.h b/torch/csrc/distributed/c10d/quantization/quantization.h index 3d2f23de421bb..1a398d75004e8 100644 --- a/torch/csrc/distributed/c10d/quantization/quantization.h +++ b/torch/csrc/distributed/c10d/quantization/quantization.h @@ -6,7 +6,6 @@ #pragma once #include -#include namespace torch::distributed::c10d::quantization { diff --git a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h index f865599595d32..c45d600b780f0 100644 --- a/torch/csrc/distributed/c10d/quantization/quantization_gpu.h +++ b/torch/csrc/distributed/c10d/quantization/quantization_gpu.h @@ -6,7 +6,6 @@ #pragma once #include -#include namespace torch::distributed::c10d::quantization { diff --git a/torch/csrc/distributed/c10d/sequence_num.hpp b/torch/csrc/distributed/c10d/sequence_num.hpp index 38bd4cb5ed9d3..a32bb3dd6026f 100644 --- a/torch/csrc/distributed/c10d/sequence_num.hpp +++ b/torch/csrc/distributed/c10d/sequence_num.hpp @@ -7,11 +7,11 @@ #include namespace c10d { -const int kUnsetSeqNum = 0; +constexpr int kUnsetSeqNum = 0; namespace { constexpr int kByteOffset = 8; -} +} // namespace // Converts from int to char vec to write in store template From fac74687a602b9db23c962a6a097541bb7acf810 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Sun, 27 Oct 2024 00:21:25 -0700 Subject: [PATCH 158/161] [compiled autograd] fix node origin graph comments (#139003) the comment update was done after prehooks were already collected, so prehooks would appear as part of the previous node Pull Request resolved: https://github.com/pytorch/pytorch/pull/139003 Approved by: https://github.com/yf225 --- .../csrc/dynamo/python_compiled_autograd.cpp | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index 024603270f787..eb11bd1b65ae6 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -647,6 +647,19 @@ CacheNode* _compiled_autograd_impl( for (size_t i = 0; i < calls.size(); i++) { NodeCall& call = *calls[i]; + + std::string _node_name = call.node->name(); + THPObjectPtr node_name(PyUnicode_FromString(_node_name.data())); + TORCH_INTERNAL_ASSERT(node_name != nullptr); + THPObjectPtr set_node_origin( + PyObject_GetAttrString(py_compiler.get(), "set_node_origin")); + PyObject* pyobj = Py_None; + if (auto pynode = std::dynamic_pointer_cast(call.node)) { + pyobj = pynode->obj; + } + check(PyObject_CallFunction( + 
set_node_origin, "OIO", node_name.get(), i, pyobj, nullptr)); + // TODO(jansel): consider adding some of this stuff: // guard(local_graph_task); NodeGuard ndguard(task.fn_); const auto // opt_parent_stream = (*func).stream(c10::DeviceType::CUDA); @@ -692,20 +705,6 @@ CacheNode* _compiled_autograd_impl( inputs = THPVariable_UnpackList(pyinputs); } - std::string _node_name = call.node->name(); - THPObjectPtr node_name(PyUnicode_FromString(_node_name.data())); - TORCH_INTERNAL_ASSERT(node_name != nullptr); - THPObjectPtr set_node_origin( - PyObject_GetAttrString(py_compiler.get(), "set_node_origin")); - - PyObject* pyobj = Py_None; - if (auto pynode = std::dynamic_pointer_cast(call.node)) { - pyobj = pynode->obj; - } - - check(PyObject_CallFunction( - set_node_origin, "OIO", node_name.get(), i, pyobj, nullptr)); - SwapSavedVariables saved(compiler_call, state, py_compiler.get(), call); variable_list outputs = call.node->apply_with_saved(inputs, saved); From dd9ff9f139d7eb81ed1120bddf2bd91d771a9983 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Sun, 27 Oct 2024 20:14:15 -0700 Subject: [PATCH 159/161] [compiled autograd] add tests for bwd hooks relative firing order (#139004) Pull Request resolved: https://github.com/pytorch/pytorch/pull/139004 Approved by: https://github.com/yf225 ghstack dependencies: #139003 --- test/inductor/test_compiled_autograd.py | 58 ++++++++++++++++ test/test_autograd.py | 91 +++++++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py index 80b174cc4ae3e..f37b71ca59a99 100644 --- a/test/inductor/test_compiled_autograd.py +++ b/test/inductor/test_compiled_autograd.py @@ -2802,6 +2802,63 @@ def fn(x): with torch._dynamo.compiled_autograd.enable(torch.compile): out.backward() + @skipIfWindows(msg="node name demangling inconsistent on windows") + def test_backward_hook_relative_ordering_partial(self): + # test backward hooks for cases that CA matches eager + + def fn(): + order = [] + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10, bias=False) + + def forward(self, x): + return self.linear(x) + + x = torch.randn(10, 10) + module = MyModule() + + def make_pre_hook(id): + return lambda _: order.append(f"pre_hook_{id}") + + def make_post_hook(id): + return lambda _1, _2: order.append(f"post_hook_{id}") + + count = 0 + + def register_hooks_on_all_nodes(nodes): + nonlocal count + for node, _ in nodes: + if node is None: + continue + count += 1 + id = f"{node.name()}_{count}" + node.register_prehook(make_pre_hook(id)) + node.register_hook(make_post_hook(id)) + register_hooks_on_all_nodes(node.next_functions) + + loss = module(x).sum() + register_hooks_on_all_nodes(((loss.grad_fn, None),)) + + def make_tensor_pre_hook(id): + return lambda _: order.append(f"tensor_pre_hook_{id}") + + def make_post_acc_grad_hook(id): + return lambda _: order.append(f"post_acc_grad_hook_{id}") + + module.linear.weight.register_hook(make_tensor_pre_hook("weight")) + + module.linear.weight.register_post_accumulate_grad_hook( + make_post_acc_grad_hook("weight") + ) + + loss.backward() + yield tuple(order) + + self.check_output_and_recompiles(fn) + def load_test_module(name): testdir = Path(__file__).absolute().parent.parent @@ -2993,6 +3050,7 @@ def wrap_test_class(orig_cls): # Category: Divergence from eager "test_invalid_gradients", # can't give autograd error due to inaccurate output metadata of lifted backward 
"test_autograd_node_isinstance", # backward ctx is a fake cls and not directly a Node instance + "test_backward_hook_relative_ordering", # compiled autograd collects breadth first, and module backward hook not supported # Uncategorized } diff --git a/test/test_autograd.py b/test/test_autograd.py index f25ca30fd963b..4cfb413d2c85c 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -74,6 +74,7 @@ skipIfMps, skipIfNoLapack, skipIfTorchDynamo, + skipIfWindows, slowTest, TestCase, xfailIfTorchDynamo, @@ -4592,6 +4593,96 @@ def hook(t_): ): t.backward() + @skipIfWindows(msg="node name demangling inconsistent on windows") + def test_backward_hook_relative_ordering(self): + order = [] + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + + x = torch.randn(10, 10, requires_grad=True) + module = MyModule() + module.register_full_backward_hook( + lambda _1, _2, _3: order.append( + "module_full_backward_hook_BackwardHookFunctionBackward0" + ) + ) + + def make_pre_hook(id): + return lambda _: order.append(f"pre_hook_{id}") + + def make_post_hook(id): + return lambda _1, _2: order.append(f"post_hook_{id}") + + count = 0 + + def register_hooks_on_all_nodes(nodes): + nonlocal count + for node, _ in nodes: + count += 1 + id = f"{node.name()}_{count}" + node.register_prehook(make_pre_hook(id)) + node.register_hook(make_post_hook(id)) + register_hooks_on_all_nodes(node.next_functions) + + loss = module(x).sum() + register_hooks_on_all_nodes(((loss.grad_fn, None),)) + + def make_tensor_pre_hook(id): + return lambda _: order.append(f"tensor_pre_hook_{id}") + + def make_post_acc_grad_hook(id): + return lambda _: order.append(f"post_acc_grad_hook_{id}") + + x.register_hook(make_tensor_pre_hook("x")) + module.linear.weight.register_hook(make_tensor_pre_hook("weight")) + module.linear.bias.register_hook(make_tensor_pre_hook("bias")) + + x.register_post_accumulate_grad_hook(make_post_acc_grad_hook("x")) + module.linear.weight.register_post_accumulate_grad_hook( + make_post_acc_grad_hook("weight") + ) + module.linear.bias.register_post_accumulate_grad_hook( + make_post_acc_grad_hook("bias") + ) + + loss.backward() + + expected_order = [ + "pre_hook_SumBackward0_1", + "post_hook_SumBackward0_1", + "pre_hook_BackwardHookFunctionBackward_2", + "post_hook_BackwardHookFunctionBackward_2", + "pre_hook_AddmmBackward0_3", + "post_hook_AddmmBackward0_3", + "tensor_pre_hook_bias", + "pre_hook_torch::autograd::AccumulateGrad_4", + "post_acc_grad_hook_bias", + "post_hook_torch::autograd::AccumulateGrad_4", + "pre_hook_TBackward0_7", + "post_hook_TBackward0_7", + "tensor_pre_hook_weight", + "pre_hook_torch::autograd::AccumulateGrad_8", + "post_acc_grad_hook_weight", + "post_hook_torch::autograd::AccumulateGrad_8", + "pre_hook_BackwardHookFunctionBackward_5", + "module_full_backward_hook_BackwardHookFunctionBackward0", + "post_hook_BackwardHookFunctionBackward_5", + "tensor_pre_hook_x", + "pre_hook_torch::autograd::AccumulateGrad_6", + "post_acc_grad_hook_x", + "post_hook_torch::autograd::AccumulateGrad_6", + ] + + self.assertEqual(len(expected_order), len(order)) + for expected, actual in zip(expected_order, order): + self.assertEqual(expected, actual) + def test_view_replay_enabled(self): def f(x): out = x.clone().view(-1) From a99e8eeb976df8def52104d92407f882ba6e5f97 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Mon, 28 Oct 2024 06:27:36 +0000 Subject: [PATCH 160/161] Propagate real tensor tracing 
with torchbind + fixing side effects (#138797)

Summary:
* Fixed real tensor tracing w/ torchbind objs by passing the cloned tensor obj. For now I just catch the exception and have an error message if the `_clone` fails, but up for discussion on what to do here
* Separate question, should we require people to set up FakeScriptObjects and stuff for draft mode?
* Prevent side effects from happening when we do the first pass of custom ops profiling by cloning/copying everything. Not sure if deepcopying the model will succeed in all cases... But also I guess this path can be removed once custom ops profiling turns into one pass.

Test Plan: `buck2 run @//mode/dev-nosan //scripts/angelayi/draft_export:test_draft_export`

Reviewed By: ydwu4

Differential Revision: D64124825

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138797
Approved by: https://github.com/ydwu4
---
 torch/_library/fake_class_registry.py | 15 ++++++++++++++-
 torch/_subclasses/fake_tensor.py | 7 ++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/torch/_library/fake_class_registry.py b/torch/_library/fake_class_registry.py
index 3c2689f5328f5..655213a051291 100644
--- a/torch/_library/fake_class_registry.py
+++ b/torch/_library/fake_class_registry.py
@@ -1,9 +1,11 @@
 # mypy: allow-untyped-defs
+import copy
 import logging
 from typing import Any, Dict, Optional, Protocol, Tuple, Union

 import torch
 from torch._library.utils import parse_namespace
+from torch.utils._python_dispatch import _disable_current_modes


 log = logging.getLogger(__name__)
@@ -15,7 +17,18 @@ def __init__(self, wrapped_obj: Any, script_class_name: str, x: torch.ScriptObje
         # The fully qualified name of the class of original script object
         self.script_class_name = script_class_name

-        self.real_obj = x
+        try:
+            with _disable_current_modes():
+                self.real_obj = copy.deepcopy(x)
+        except RuntimeError:
+            log.warning(
+                "Unable to deepcopy the custom object %s. "
+                "Defaulting to the user given object. This might be "
+                "dangerous as side effects may be directly applied "
+                "to the object.",
+                script_class_name,
+            )
+            self.real_obj = x


 class FakeScriptMethod:
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index df2fef74e127e..df8a2876586c2 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -38,6 +38,7 @@
 import torch
 from torch import SymBool, SymFloat, SymInt, Tensor
 from torch._C._functorch import is_functorch_wrapped_tensor, is_legacy_batchedtensor
+from torch._library.fake_class_registry import FakeScriptObject
 from torch._prims_common import suggest_memory_format
 from torch._subclasses.meta_utils import (
     assert_eq,
@@ -1947,7 +1948,9 @@ def _dispatch_impl(
         args, kwargs = pytree.tree_unflatten(flat_args, args_spec)
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)

-        def maybe_to_real_tensor(t: T) -> Optional[Union[T, Tensor]]:
+        def maybe_to_real_tensor(
+            t: T,
+        ) -> Optional[Union[T, Tensor, torch._C.ScriptObject]]:
             if isinstance(t, FakeTensor):
                 return t.real_tensor
             elif isinstance(t, py_sym_types):
@@ -1957,6 +1960,8 @@ def maybe_to_real_tensor(t: T) -> Optional[Union[T, Tensor]]:
                         self.shape_env.unbacked_var_to_val
                     )
                 )
+            elif isinstance(t, FakeScriptObject):
+                return t.real_obj
             else:
                 return t

From 633dcf1a2d8007a41a13250e36cb8f33ed85bc4a Mon Sep 17 00:00:00 2001
From: Tuan Trieu
Date: Mon, 28 Oct 2024 06:28:30 +0000
Subject: [PATCH 161/161] Constant folding for lifted graph (#135060)

Summary:
Current implementation for lifted graph takes a dict of [constant name: constant value]. And the constant value is used to run_node and execute the constant graph to get the folded values and then create new getattr nodes for folded values.

We don't have constant values for lifted graph during model compilation on MTIA. I think it is more general to allow the constant folding pass to just take the constant names only to produce the constant graph and represent the folded nodes as placeholders to make it consistent with lifted graph. Additionally, this mimics the real situation on Sigmoid, where Sigmoid executes the constant graph, gets the folded values and sets the folded values to the main graph.

This diff is to update the pass to work with a list of constant names.

Test Plan:
```
buck run mode/opt caffe2/test:test_export -- -r split_const_gm
```

Differential Revision: D62144791

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135060
Approved by: https://github.com/SherlockNoMad

Co-authored-by: Tuan Trieu
---
 test/export/test_export.py          |  35 ++++++++--
 torch/_inductor/compile_fx.py       |  14 ++--
 torch/_inductor/constant_folding.py | 103 ++++++++++++++++------------
 3 files changed, 97 insertions(+), 55 deletions(-)

diff --git a/test/export/test_export.py b/test/export/test_export.py
index d00ceac949cf8..2ff07ebbdc3c4 100755
--- a/test/export/test_export.py
+++ b/test/export/test_export.py
@@ -8089,7 +8089,9 @@ def forward(self, x):
                 w_transpose = torch.transpose(self.w_pre, 0, 1)
                 w_relu = torch.nn.functional.relu(w_transpose)
                 w = w_relu + self.b
-                return torch.matmul(x, w)
+                return (
+                    torch.matmul(x, w) + self.b + torch.arange(4, dtype=torch.float16)
+                )
 
         example_inputs = (torch.randn(4, 4),)
         mod = Model()
@@ -8105,17 +8107,38 @@ def forward(self, x):
             for n, spec in zip(placeholder_nodes, new_sig.input_specs)
             if spec.target is not None
         }
-        const_gm, _ = split_const_gm(new_gm, lifted_constants)
+        # [self.w_pre, self.b]
+        lifted_constant_names = list(lifted_constants)
+        lifted_constant_values = [lifted_constants[n] for n in lifted_constant_names]
+        const_gm, _ = split_const_gm(new_gm, False, lifted_constant_names)
         counter = 0
         for node in const_gm.graph.nodes:
             if node.op == "call_function":
                 counter += 1
-        self.assertTrue(counter > 0)
+        self.assertTrue(counter == 4)
+        counter = 0
+        for n in new_gm.graph.nodes:
+            if n.op == "placeholder":
+                counter += 1
+        # expect 3 existing placeholders and 2 folded constant
+        self.assertTrue(counter == 5)
+        # return (self.b, folded_const, folded_const)
+        const_folded_value = const_gm(*lifted_constant_values)
+
         test_input = torch.randn(4, 4)
-        expected = new_gm(None, None, test_input)[0]
-        actual = mod(test_input)
+        # new_gm(c_w_pre, b, x, folded_const, folded_const)
+        actual = new_gm(
+            lifted_constant_values[0],
+            const_folded_value[0],
+            test_input,
+            const_folded_value[1],
+            const_folded_value[2],
+        )[0]
+        expected = mod(test_input)
         self.assertEqual(actual, expected)
-        const_gm, _ = split_const_gm(ep.graph_module, lifted_constants, lambda x: True)
+        const_gm, _ = split_const_gm(
+            ep.graph_module, False, lifted_constant_names, lambda x: True
+        )
         counter = 0
         for node in const_gm.graph.nodes:
             if node.op == "call_function":
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 0b9bf0c8b6f27..ea28ad9d3732e 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -350,7 +350,8 @@ def _recursive_post_grad_passes(gm: GraphModule, is_inference: bool = False) ->
 
 def split_const_gm(
     gm: GraphModule,
-    lifted_constants: Optional[Dict[str, Any]] = None,
+    skip_constructor: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
     skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
 ) -> Tuple[GraphModule, Dict[str, int]]:
     """
@@ -377,9 +378,10 @@ def split_const_gm(
         run_and_get_constant_graph,
     )
 
-    const_gm, const_result = run_and_get_constant_graph(
-        gm, lifted_constants, skip_folding_node_fn
-    )
+    const_gm = run_and_get_constant_graph(
+        gm, skip_constructor, lifted_constant_names, skip_folding_node_fn
+    )
+    const_result = const_gm() if lifted_constant_names is None else None
 
     const_outputs = {
         x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0])
@@ -399,7 +401,11 @@ def split_const_gm(
             replace_node_with_constant(
                 gm,
                 node,
-                const_result[const_outputs[node.name]],
+                (
+                    const_result[const_outputs[node.name]]
+                    if lifted_constant_names is None
+                    else None
+                ),
                 new_const_name,
             )
             const_output_index[new_const_name] = const_outputs[node.name]
diff --git a/torch/_inductor/constant_folding.py b/torch/_inductor/constant_folding.py
index 09abe579b5204..2d0df289316b5 100644
--- a/torch/_inductor/constant_folding.py
+++ b/torch/_inductor/constant_folding.py
@@ -1,5 +1,5 @@
 import collections
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 import torch.utils._pytree as pytree
@@ -18,7 +18,7 @@
 def replace_node_with_constant(
     gm: torch.fx.GraphModule,
     node: torch.fx.Node,
-    constant: torch.Tensor,
+    constant: Optional[torch.Tensor] = None,
     name: Optional[str] = None,
 ) -> None:
     g = gm.graph
@@ -39,24 +39,25 @@ def replace_node_with_constant(
         gm._frozen_param_count = i + 1
 
     with g.inserting_before(node):
-        new_input_node = g.create_node("get_attr", qualname, (), {})
+        if constant is not None:
+            new_input_node = g.create_node("get_attr", qualname, (), {})
+        else:
+            # this is the case for lifted constants
+            new_input_node = g.create_node("placeholder", qualname, (), {})
         node.replace_all_uses_with(new_input_node)
         new_input_node.meta.update(node.meta)
         g.erase_node(node)
 
-    # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
-    gm.register_buffer(qualname, constant)
-    setattr(gm, qualname, constant)
+    if constant is not None:
+        # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
+        gm.register_buffer(qualname, constant)
+        setattr(gm, qualname, constant)
 
 
 def is_const_source(
-    node: torch.fx.Node, lifted_constants: Optional[Dict[str, Any]]
+    node: torch.fx.Node, lifted_constant_names: Optional[List[str]]
 ) -> bool:
-    return node.op == "get_attr" or (
-        node.op == "placeholder"
-        and lifted_constants is not None
-        and node.name in lifted_constants
-    )
+    return node.op == "get_attr" or node.name in (lifted_constant_names or ())
 
 
 class ConstantFolder(torch.fx.Interpreter):
@@ -64,7 +65,7 @@ def __init__(
         self,
         gm: torch.fx.GraphModule,
         skip_constructors: bool = False,
-        lifted_constants: Optional[Dict[str, torch.Tensor]] = None,
+        lifted_constant_names: Optional[List[str]] = None,
         skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
     ) -> None:
         super().__init__(gm)
@@ -76,14 +77,27 @@ def __init__(
         # overwrite this to deallocate env values if their only remaining use
        # is the output
         self.user_to_last_uses = self.node_to_last_non_output_use()
-        self.lifted_constants = lifted_constants
+        self.lifted_constant_names = lifted_constant_names
+        self.deferred_value = object()
 
     def _support_dynamic_shape(self) -> bool:
         # ConstantFolder not support dynamic shape now
         return False
 
     def _deduce_value(self, node: torch.fx.Node) -> Any:
-        return super().run_node(node)
+        if self.lifted_constant_names is None:
+            return super().run_node(node)
+        # if lifted_constant_names is passed in, no concrete value is available
+        # so we just check if all inputs have values
+        flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+        for inp in flattened_node_inps:
+            if (
+                isinstance(inp, torch.fx.Node)
+                and inp.name not in (self.lifted_constant_names or ())
+                and self.env[inp] != self.deferred_value
+            ):
+                return self.unknown_value
+        return self.deferred_value
 
     def is_impure(self, node: torch.fx.node.Node) -> bool:
         def is_woq_int8_pattern(node: torch.fx.node.Node) -> bool:
@@ -103,7 +117,7 @@ def is_woq_int8_pattern(node: torch.fx.node.Node) -> bool:
                 and is_woq_int8_pattern(next(iter(node.users)))
             )
         ) and is_const_source(
-            node.args[0], self.lifted_constants  # type: ignore[arg-type]
+            node.args[0], self.lifted_constant_names  # type: ignore[arg-type]
         ):
             # Case 1: int8_weight -> dq -> bf16_weight
             # Case 2: int8_weight -> permute -> dq -> bf16_weight
@@ -191,7 +205,7 @@ def set_env(arg: torch.fx.Node) -> None:
             # TODO - more complicated strategy
             if (
                 self.skip_constructors
-                and not is_const_source(node, self.lifted_constants)
+                and not is_const_source(node, self.lifted_constant_names)
                 and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
             ):
                 return self.unknown_value
@@ -207,10 +221,10 @@ def set_env(arg: torch.fx.Node) -> None:
             if out == self.unknown_value:
                 return self.unknown_value
 
-        if not is_const_source(node, self.lifted_constants) and isinstance(
-            out, torch.Tensor
+        if not is_const_source(node, self.lifted_constant_names) and (
+            isinstance(out, torch.Tensor) or out == self.deferred_value
         ):
-            if out.device.type == "meta":
+            if out != self.deferred_value and out.device.type == "meta":
                 return out
 
             if not self.insertable_tensor_check(out):
@@ -248,10 +262,12 @@ def run(self) -> Any:  # type: ignore[override]
 
     def insert_placerholder_values(self, env: Dict[torch.fx.Node, Any]) -> None:
         for n in self.module.graph.find_nodes(op="placeholder"):
-            if self.lifted_constants is not None and n.name in self.lifted_constants:
-                env[n] = self.lifted_constants[n.name]
-            else:
-                env[n] = self.unknown_value  # type: ignore[assignment]
+            env[n] = self.unknown_value  # type: ignore[assignment]
+        if self.lifted_constant_names is None:
+            return
+        for n in self.module.graph.nodes:
+            if n.name in (self.lifted_constant_names or ()):
+                env[n] = self.deferred_value
 
 
 def constant_fold(
@@ -284,12 +300,15 @@ def constant_fold(
 
 def constant_graph_tag(
     gm: torch.fx.GraphModule,
-    lifted_constants: Optional[Dict[str, Any]],
-    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
+    skip_constructors: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
 ) -> None:
     with torch.utils._python_dispatch._disable_current_modes():
         cf = ConstantFolder(
-            gm, skip_constructors=True, lifted_constants=lifted_constants
+            gm,
+            skip_constructors=skip_constructors,
+            lifted_constant_names=lifted_constant_names,
         )
         cf.run()
 
@@ -298,7 +317,7 @@ def constant_graph_tag(
             node.meta[META_TAG] = MODULE_TAG
             continue
         if (
-            is_const_source(node, lifted_constants)
+            is_const_source(node, lifted_constant_names)
             or node in cf.node_replacements
             or node in cf.replaced_uses
         ):
@@ -309,15 +328,18 @@ def constant_graph_tag(
 
 def run_and_get_constant_graph(
     gm: torch.fx.GraphModule,
-    lifted_constants: Optional[Dict[str, Any]],
-    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]],
-) -> Tuple[torch.fx.GraphModule, Tuple[torch.Tensor, ...]]:
+    skip_constructors: bool = True,
+    lifted_constant_names: Optional[List[str]] = None,
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+) -> torch.fx.GraphModule:
     """
    Construct a GraphModule which corresponds to the part which could be constant folded in
    provided gm.
    """
 
-    constant_graph_tag(gm, lifted_constants, skip_folding_node_fn)
+    constant_graph_tag(
+        gm, skip_constructors, lifted_constant_names, skip_folding_node_fn
+    )
 
     def untag(node: torch.fx.Node) -> bool:
        used_to_fold = False
@@ -329,19 +351,11 @@ def untag(node: torch.fx.Node) -> bool:
             node.meta[META_TAG] = MODULE_TAG
         return used_to_fold
 
-    const_args = []
-    if lifted_constants is not None:
-        placeholders = list(gm.graph.find_nodes(op="placeholder"))
-        for node in placeholders:
-            if node.meta[META_TAG] == MODULE_TAG:
-                continue
-            if untag(node):
-                const_args.append(lifted_constants[node.name])
-
     # We rewrite the tags, if it's a constant being directly consumed, without
     # any folding opportunity, we keep it in main gm.
-    for node in gm.graph.find_nodes(op="get_attr"):
-        untag(node)
+    for node in gm.graph.nodes:
+        if node.op == "getattr" or (node.name in (lifted_constant_names or ())):
+            untag(node)
 
     new_graph = torch.fx.Graph()
 
@@ -363,5 +377,4 @@ def untag(node: torch.fx.Node) -> bool:
     new_graph.lint()
     new_gm = torch.fx.GraphModule(gm, new_graph)
 
-    const_result = new_gm(*const_args)
-    return new_gm, const_result
+    return new_gm