From 31aa3369d4056813ddda532161e717e571c44ce9 Mon Sep 17 00:00:00 2001 From: Sandeep Datta <128171450+sandeepd-nv@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:24:09 +0530 Subject: [PATCH 01/18] GH artifacts based CI (#1043) --- .github/actions/download-artifacts/action.yml | 48 ++++++ .github/workflows/ci-gh.yml | 11 +- .github/workflows/gh-build-and-test.yml | 88 +++++++++-- .github/workflows/gh-build.yml | 148 ++++++++---------- .github/workflows/gh-cleanup.yml | 43 ----- .github/workflows/gh-test.yml | 91 +++++++++++ cmake/versions.json | 2 +- conda/conda-build/meta.yaml | 4 +- continuous_integration/Dockerfile | 44 ------ continuous_integration/dot-gitconfig | 3 + .../home/coder/.local/bin/build-cunumeric-all | 17 -- .../scripts/build-cunumeric-all | 44 ++++++ .../bin => scripts}/build-cunumeric-conda | 19 ++- .../bin => scripts}/build-cunumeric-cpp | 1 - .../bin => scripts}/build-cunumeric-wheel | 0 continuous_integration/scripts/entrypoint | 46 ++++++ continuous_integration/scripts/test-cunumeric | 61 ++++++++ 17 files changed, 458 insertions(+), 212 deletions(-) create mode 100644 .github/actions/download-artifacts/action.yml delete mode 100644 .github/workflows/gh-cleanup.yml create mode 100644 .github/workflows/gh-test.yml delete mode 100644 continuous_integration/Dockerfile create mode 100644 continuous_integration/dot-gitconfig delete mode 100644 continuous_integration/home/coder/.local/bin/build-cunumeric-all create mode 100755 continuous_integration/scripts/build-cunumeric-all rename continuous_integration/{home/coder/.local/bin => scripts}/build-cunumeric-conda (86%) rename continuous_integration/{home/coder/.local/bin => scripts}/build-cunumeric-cpp (93%) rename continuous_integration/{home/coder/.local/bin => scripts}/build-cunumeric-wheel (100%) create mode 100755 continuous_integration/scripts/entrypoint create mode 100755 continuous_integration/scripts/test-cunumeric diff --git a/.github/actions/download-artifacts/action.yml b/.github/actions/download-artifacts/action.yml new file mode 100644 index 0000000000..e8019b1b19 --- /dev/null +++ b/.github/actions/download-artifacts/action.yml @@ -0,0 +1,48 @@ +name: setup-legate-conda + +description: Download dependencies (artifacts) + +inputs: + device: {type: string, required: true} + git_sha: {type: string, required: true} + +runs: + using: composite + steps: + + - id: cache + name: Cache conda artifacts + uses: actions/cache@v3 + with: + key: "nv-legate/legate.core@${{ inputs.git_sha }}-${{ inputs.device }}" + path: .artifacts + + - if: steps.cache.outputs.cache-hit != 'true' + name: Download conda artifacts + uses: dawidd6/action-download-artifact@v2 + with: + path: .artifacts-dl + repo: nv-legate/legate.core + commit: ${{ inputs.git_sha }} + workflow_conclusion: success + workflow: "ci-gh-${{ inputs.device }}-build-and-test.yml" + name: "legate.core-${{ inputs.device }}-[0-9a-z]{40}" + name_is_regexp: true + + - if: steps.cache.outputs.cache-hit != 'true' + name: Move conda artifacts into cached dir + shell: bash --noprofile --norc -xeo pipefail {0} + run: | + mkdir -p .artifacts; + find .artifacts-dl/legate.core-${{ inputs.device }}-*/ \ + -maxdepth 2 -type d -name legate_core -exec mv {} .artifacts/ \; + find .artifacts-dl/legate.core-${{ inputs.device }}-*/ \ + -maxdepth 2 -type f -name "environment*.yaml" -exec mv {} .artifacts/ \; + + - name: Copy and change cache dir ownership + shell: bash --noprofile --norc -xeo pipefail {0} + run: | + # Copy and change directory ownership + cp -ar .artifacts 
/home/coder/.artifacts; + chown -R coder:coder /home/coder/.artifacts; + ls -R /home/coder/.artifacts diff --git a/.github/workflows/ci-gh.yml b/.github/workflows/ci-gh.yml index f1aafdd595..ffb77c10e8 100644 --- a/.github/workflows/ci-gh.yml +++ b/.github/workflows/ci-gh.yml @@ -16,10 +16,13 @@ jobs: fail-fast: false matrix: include: - - {build-target: cpu} - - {build-target: gpu} + - device: "gpu" + image: "rapidsai/devcontainers:23.06-cpp-mambaforge-ubuntu22.04" + + - device: "cpu" + image: "rapidsai/devcontainers:23.06-cpp-mambaforge-ubuntu22.04" uses: ./.github/workflows/gh-build-and-test.yml with: - build-target: ${{ matrix.build-target }} - sha: ${{ github.sha }} + device: ${{ matrix.device }} + image: ${{ matrix.image }} diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index f297b97618..e0e87899da 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -1,32 +1,92 @@ on: workflow_call: inputs: - build-target: - required: true + image: type: string - sha: required: true + device: type: string + required: true + jobs: build: - name: "Build cunumeric (with ${{ inputs.build-target }} legate) on GH" + name: "Build cunumeric (with ${{ inputs.device }} legate) on GH" uses: ./.github/workflows/gh-build.yml with: - build-target: ${{ inputs.build-target }} - # Ref: https://docs.rapids.ai/resources/github-actions/#cpu-labels for `linux-amd64-cpu4` - runs-on: ${{ github.repository_owner == 'nv-legate' && 'linux-amd64-cpu4' || 'ubuntu-latest' }} - sha: ${{ inputs.sha }} + device: ${{ inputs.device }} + image: ${{ inputs.image }} + runs-on: ${{ github.repository_owner == 'nv-legate' && 'linux-amd64-32cpu' || 'ubuntu-latest' }} - cleanup: + test: needs: - build + strategy: + fail-fast: false + matrix: + include: + - name: 1 CPU test + options: test --cpus 1 --unit --debug + runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-cpu4' }} + has-gpu: false + enabled: true + + - name: 2 CPUs test + options: test --cpus 2 --debug + runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-cpu8' }} + has-gpu: false + enabled: true + + - name: GPU test + options: test --use cuda --gpus 1 --debug + runner: linux-amd64-gpu-v100-latest-1 + has-gpu: true + enabled: ${{ inputs.device == 'gpu' }} + + - name: 2 GPUs test + options: test --use cuda --gpus 2 --debug + runner: linux-amd64-2gpu + has-gpu: true + enabled: ${{ inputs.device == 'gpu' }} + + - name: OpenMP test + options: test --use openmp --omps 1 --ompthreads 2 --debug + runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-32cpu' }} + has-gpu: ${{ inputs.device == 'gpu' }} + enabled: false + + - name: 2 NUMA OpenMPs test + options: test --use openmp --omps 2 --ompthreads 2 --numamem 2048 --debug + runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-32cpu' }} + has-gpu: ${{ inputs.device == 'gpu' }} + enabled: false + + - name: Eager execution test + options: test --use eager --debug + runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-cpu4' }} + has-gpu: ${{ inputs.device == 'gpu' }} + enabled: true + + - name: mypy + options: mypy + runner: linux-amd64-cpu4 + has-gpu: false + enabled: true + + - name: documentation + options: docs + runner: linux-amd64-32cpu + has-gpu: false + enabled: ${{ inputs.device == 'gpu' }} - # This ensures the cleanup job runs even if previous jobs fail or the workflow is 
cancelled. - if: always() uses: - ./.github/workflows/gh-cleanup.yml + ./.github/workflows/gh-test.yml with: - build-target: ${{ inputs.build-target }} - sha: ${{ inputs.sha }} + name: ${{ matrix.name }} + device: ${{ inputs.device }} + image: ${{ inputs.image }} + runs-on: ${{ matrix.runner }} + has-gpu: ${{ matrix.has-gpu }} + test-options: ${{ matrix.options }} + enabled: ${{ matrix.enabled }} diff --git a/.github/workflows/gh-build.yml b/.github/workflows/gh-build.yml index c84ac0b9a8..308b5f78c2 100644 --- a/.github/workflows/gh-build.yml +++ b/.github/workflows/gh-build.yml @@ -1,123 +1,101 @@ -name: Build cunumeric on GH +name: Build on: workflow_call: inputs: - build-target: - required: true + image: type: string - runs-on: + required: true + device: required: true type: string - sha: + runs-on: required: true type: string -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BASE_IMAGE: rapidsai/devcontainers:23.06-cpp-cuda11.8-mambaforge-ubuntu22.04 - IMAGE_NAME_LEGATE: legate.core-${{ inputs.build-target }} - IMAGE_NAME_CUNUMERIC: cunumeric-${{ inputs.build-target }} - USE_CUDA: ${{ (inputs.build-target == 'cpu' && 'OFF') || 'ON' }} - jobs: build: - name: build-${{ inputs.build-target }}-sub-workflow + name: build-${{ inputs.device }}-sub-workflow permissions: id-token: write # This is required for configure-aws-credentials contents: read # This is required for actions/checkout - packages: write # This is required to push docker image to ghcr.io - + runs-on: ${{ inputs.runs-on }} - steps: - - name: Checkout legate.core - uses: actions/checkout@v3 - with: - repository: nv-legate/legate.core - fetch-depth: 0 - path: legate + container: + options: -u root + image: "${{ inputs.image }}" + env: + CUDA_VERSION: "12.0" + CUDA_VERSION_MAJOR: "12" + CUDA_VERSION_MINOR: "0" + SCCACHE_REGION: "us-east-2" + SCCACHE_BUCKET: "rapids-sccache-devs" + SCCACHE_S3_KEY_PREFIX: "legate-cunumeric-dev" + USE_CUDA: "${{ inputs.device == 'gpu' && 'ON' || 'OFF' }}" + GH_TOKEN: "${{ env.GH_TOKEN }}" + GITHUB_TOKEN: "${{ env.GITHUB_TOKEN }}" + VAULT_HOST: "${{ github.repository_owner != 'nv-legate' && 'https://vault.ops.k8s.rapids.ai' || '' }}" + defaults: + run: + shell: su coder {0} + working-directory: /home/coder + steps: - name: Checkout cunumeric (= this repo) uses: actions/checkout@v3 with: fetch-depth: 0 path: cunumeric + persist-credentials: false - - if: github.repository_owner == 'nv-legate' - name: Get AWS credentials for sccache bucket - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-region: us-east-2 - role-duration-seconds: 28800 # 8 hours - role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-nv-legate - - - name: Docker system prune + - name: Dump environment run: | - docker version - docker system prune --all --force + env - - name: Build legate.core using docker build + - name: Copy source folder run: | - echo BUILD_TARGET: ${{ inputs.build-target }} - echo USE_CUDA: ${{ env.USE_CUDA }} - - export LEGATE_SHA=$(cat cunumeric/cmake/versions.json | jq -r '.packages.legate_core.git_tag') - echo "Checking out LEGATE_SHA: ${LEGATE_SHA}" - git -C legate checkout $LEGATE_SHA - - IMAGE_TAG_LEGATE=${{ env.IMAGE_NAME_LEGATE }}:${{ inputs.sha }} - - chmod +x legate/continuous_integration/build-docker-image - legate/continuous_integration/build-docker-image \ - --base-image "$BASE_IMAGE" \ - --image-tag "$IMAGE_TAG_LEGATE" \ - --source-dir legate - - - name: Build cunumeric using docker build - run: | - IMAGE_TAG_CUNUMERIC=${{ env.IMAGE_NAME_CUNUMERIC }}:${{ inputs.sha }} - 
IMAGE_TAG_LEGATE=${{ env.IMAGE_NAME_LEGATE }}:${{ inputs.sha }} - - legate/continuous_integration/build-docker-image \ - --base-image "$IMAGE_TAG_LEGATE" \ - --image-tag "$IMAGE_TAG_CUNUMERIC" \ - --source-dir cunumeric - - - name: Dump docker history of image before upload + set -x + pwd + cp -r $GITHUB_WORKSPACE/cunumeric . + chown -R coder:coder cunumeric; + ls -R + + - name: Copy .gitconfig + run: cp ~/cunumeric/continuous_integration/dot-gitconfig ~/.gitconfig + + - id: legate_core_info + name: Read legate.core SHA + shell: bash --noprofile --norc -xeo pipefail {0} run: | - IMAGE_TAG=${{ env.IMAGE_NAME_CUNUMERIC }}:${{ inputs.sha }} - docker history $IMAGE_TAG - - - name: Log in to container image registry - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u $ --password-stdin - - - name: Push cunumeric image - run: | - IMAGE_TAG=${{ env.IMAGE_NAME_CUNUMERIC }}:${{ inputs.sha }} - - IMAGE_ID=ghcr.io/${{ github.repository_owner }} + git_tag="$(jq -r '.packages.legate_core.git_tag' cunumeric/cmake/versions.json)"; - # Change all uppercase to lowercase - IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') + echo "git_tag=$git_tag" | tee -a "${GITHUB_OUTPUT}"; - IMAGE_ID=$IMAGE_ID/$IMAGE_TAG + - name: Download dependencies (artifacts) + uses: ./cunumeric/.github/actions/download-artifacts + with: + device: "${{ inputs.device }}" + git_sha: "${{ steps.legate_core_info.outputs.git_tag }}" - docker tag $IMAGE_TAG $IMAGE_ID - docker push $IMAGE_ID + - if: github.repository_owner == 'nv-legate' + name: Get AWS credentials for sccache bucket + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: us-east-2 + role-duration-seconds: 28800 # 8 hours + role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-nv-legate - - name: Copy artifacts back to the host + - name: Build cunumeric run: | - IMAGE_TAG=${{ env.IMAGE_NAME_CUNUMERIC }}:${{ inputs.sha }} - mkdir -p artifacts - docker run -v "$(pwd)/artifacts:/home/coder/.artifacts" --rm -t $IMAGE_TAG copy-artifacts - - - name: Display structure of workdir - run: ls -R + export PATH="/home/coder/cunumeric/continuous_integration/scripts:$PATH" + build-cunumeric-all - name: Upload build artifacts uses: actions/upload-artifact@v3 with: - name: "cunumeric-${{ inputs.build-target }}-${{ inputs.sha }}" - path: artifacts + name: "cunumeric-${{ inputs.device }}-${{ github.sha }}" + path: | + /tmp/out + /tmp/conda-build diff --git a/.github/workflows/gh-cleanup.yml b/.github/workflows/gh-cleanup.yml deleted file mode 100644 index 6451c401c3..0000000000 --- a/.github/workflows/gh-cleanup.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: Clean up - -on: - workflow_call: - inputs: - build-target: - required: true - type: string - sha: - required: true - type: string - -env: - IMAGE_NAME: cunumeric-${{ inputs.build-target }} - -jobs: - cleanup: - permissions: - packages: write - - runs-on: ubuntu-latest - - steps: - - name: Delete docker image - run: | - set -xeuo pipefail - - PACKAGE_NAME=${{ env.IMAGE_NAME }} - PACKAGE_VERSION_ID=$( - curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ github.token }}"\ - -H "X-GitHub-Api-Version: 2022-11-28" \ - https://api.github.com/orgs/${{ github.repository_owner }}/packages/container/$PACKAGE_NAME/versions | - jq '.[] | select(.metadata.container.tags[] == "${{ inputs.sha }}") | .id' - - ) - - curl -L \ - -X DELETE \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ github.token }}"\ - -H "X-GitHub-Api-Version: 2022-11-28" \ - 
https://api.github.com/orgs/${{ github.repository_owner }}/packages/container/$PACKAGE_NAME/versions/$PACKAGE_VERSION_ID diff --git a/.github/workflows/gh-test.yml b/.github/workflows/gh-test.yml new file mode 100644 index 0000000000..675f27e9ba --- /dev/null +++ b/.github/workflows/gh-test.yml @@ -0,0 +1,91 @@ +name: Test cunumeric on GH + +on: + workflow_call: + inputs: + name: + required: true + type: string + image: + type: string + required: true + device: + required: true + type: string + runs-on: + required: true + type: string + has-gpu: + required: true + type: boolean + description: "The runner has GPU(s)." + test-options: + required: true + type: string + enabled: + required: true + type: boolean + +env: + build_artifact_name: "cunumeric-${{ inputs.device }}-${{ github.sha }}" + +jobs: + test: + name: ${{ inputs.name }} + if: inputs.enabled && github.repository_owner == 'nv-legate' + runs-on: ${{ inputs.runs-on }} + + container: + options: -u root + image: "${{ inputs.image }}" + env: + # CUDA_VERSION: "${{ inputs.CUDA }}" + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + + defaults: + run: + shell: su coder {0} + working-directory: /home/coder + + steps: + - if: inputs.has-gpu + name: Run nvidia-smi to make sure GPU is working + run: nvidia-smi + + - name: Install numactl + run: | + export DEBIAN_FRONTEND=noninteractive && \ + sudo apt-get update && \ + sudo apt-get install -y numactl + + - name: Checkout cunumeric + uses: actions/checkout@v3 + with: + fetch-depth: 0 + path: cunumeric + persist-credentials: false + + - name: Copy source folder + run: | + set -x + pwd + cp -r $GITHUB_WORKSPACE/cunumeric . + chown -R coder:coder cunumeric; + ls -R + + - name: Download build artifacts + uses: actions/download-artifact@v3 + with: + name: ${{ env.build_artifact_name }} + path: /home/coder/.artifacts + + - name: Run cunumeric test / analysis + shell: su coder {0} + run: | + set -x + sudo chown -R coder:coder /home/coder/.artifacts + + export PATH="/home/coder/cunumeric/continuous_integration/scripts:$PATH" + + set -eo pipefail + test-cunumeric ${{ inputs.test-options }} diff --git a/cmake/versions.json b/cmake/versions.json index d1ae134aba..7a98894dd5 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -5,7 +5,7 @@ "git_url" : "https://github.com/nv-legate/legate.core.git", "git_shallow": false, "always_download": false, - "git_tag" : "14cca04834095553e4d88f503dc4cd35e4072212" + "git_tag" : "06b0e4d7fded0b4207fd8b4ba34c330333ee3543" } } } diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index 81569e3a24..c652d931bf 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -10,7 +10,7 @@ ## The placeholder version is strictly for making two-pass conda build process. ## It should not be used for any other purpose, and this is not a default version. 
{% set placeholder_version = '0.0.0.dev' %} -{% set default_cuda_version = '11.8' %} +{% set default_cuda_version = '12.0' %} {% set cuda_version='.'.join(environ.get('CUDA', default_cuda_version).split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0]|int %} {% set py_version=environ.get('CONDA_PY', '') %} @@ -138,7 +138,7 @@ requirements: - cuda-version >={{ cuda_version }},<{{ cuda_major+1 }} - cutensor >=1.3 =*_* - libcublas - - libcusolver =11.4.1.48-0 + - libcusolver >=11.4.1.48-0 - libcufft {% endif %} - opt_einsum >=3.3 diff --git a/continuous_integration/Dockerfile b/continuous_integration/Dockerfile deleted file mode 100644 index 4e6478d143..0000000000 --- a/continuous_integration/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as stage0 - -COPY --chown=coder:coder continuous_integration/home/coder/.local/bin/* /home/coder/.local/bin/ -COPY --chown=coder:coder . /home/coder/cunumeric - -RUN chmod a+x /home/coder/.local/bin/* - -#--------------------------------------------------- -FROM stage0 as setup - -USER coder -WORKDIR /home/coder - -RUN set -x && . conda-utils && \ - get_yaml_and_make_conda_env && \ - install_legate_core_with_war - -#--------------------------------------------------- -FROM setup as build -USER coder -WORKDIR /home/coder - -ARG GITHUB_TOKEN -ENV GITHUB_TOKEN=${GITHUB_TOKEN} -ARG AWS_SESSION_TOKEN -ENV AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} -ARG AWS_ACCESS_KEY_ID -ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} -ARG AWS_SECRET_ACCESS_KEY -ENV AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - -COPY --chown=coder:coder .creds /run/secrets - -RUN entrypoint build-cunumeric-all - -#--------------------------------------------------- -FROM stage0 as final -USER coder -WORKDIR /home/coder - -COPY --from=build --chown=coder:coder /tmp/out /tmp/out -COPY --from=build --chown=coder:coder /tmp/conda-build /tmp/conda-build -COPY --from=build --chown=coder:coder /tmp/env_yaml /tmp/env_yaml diff --git a/continuous_integration/dot-gitconfig b/continuous_integration/dot-gitconfig new file mode 100644 index 0000000000..91ac79c701 --- /dev/null +++ b/continuous_integration/dot-gitconfig @@ -0,0 +1,3 @@ +[user] + email = users.noreply.github.com + name = anon \ No newline at end of file diff --git a/continuous_integration/home/coder/.local/bin/build-cunumeric-all b/continuous_integration/home/coder/.local/bin/build-cunumeric-all deleted file mode 100644 index 62c6da0d7e..0000000000 --- a/continuous_integration/home/coder/.local/bin/build-cunumeric-all +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - - -build_cunumeric_all() { - set -x - cd ~/; - - conda info - - set -euo pipefail; - - build-cunumeric-cpp; - build-cunumeric-wheel; - build-cunumeric-conda; -} - -(build_cunumeric_all "$@"); diff --git a/continuous_integration/scripts/build-cunumeric-all b/continuous_integration/scripts/build-cunumeric-all new file mode 100755 index 0000000000..bcdbf62ec5 --- /dev/null +++ b/continuous_integration/scripts/build-cunumeric-all @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +setup_env() { + yaml_file=$(find ~/.artifacts -name "environment*.yaml" | head -n 1) + + [ "${USE_CUDA:-}" = "ON" ] && + echo " - libcublas-dev" >> "${yaml_file}" && + echo " - libcufft-dev" >> "${yaml_file}" && + echo " - libcurand-dev" >> "${yaml_file}" && + echo " - libcusolver-dev" >> "${yaml_file}"; + + echo "YAML file..." 
+ cat "${yaml_file}" + + mkdir -p /tmp/out; + + cp "${yaml_file}" /tmp/out + + mamba env create -n legate -f "$yaml_file" + + mamba uninstall -yn legate numpy + + mamba install -yn legate -c ~/.artifacts/legate_core -c conda-forge -c nvidia legate-core + + mamba activate legate +} + +build_cunumeric_all() { + set -xeo pipefail + + setup_env; + cd ~/cunumeric; + conda info; + + set -xeuo pipefail; + printf "\n\n\n\n********* BUILDING CUNUMERIC CPP *********\n" + build-cunumeric-cpp; + printf "\n\n\n\n********* BUILDING CUNUMERIC WHEEL *********\n" + build-cunumeric-wheel; + printf "\n\n\n\n********* BUILDING CUNUMERIC CONDA *********\n" + build-cunumeric-conda; +} + +(build_cunumeric_all "$@"); diff --git a/continuous_integration/home/coder/.local/bin/build-cunumeric-conda b/continuous_integration/scripts/build-cunumeric-conda similarity index 86% rename from continuous_integration/home/coder/.local/bin/build-cunumeric-conda rename to continuous_integration/scripts/build-cunumeric-conda index 0be424252d..ee4efefcb2 100755 --- a/continuous_integration/home/coder/.local/bin/build-cunumeric-conda +++ b/continuous_integration/scripts/build-cunumeric-conda @@ -9,6 +9,9 @@ build_cunumeric_conda_package() { python_version="$(python3 --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f3 --complement)"; fi + mkdir -p /tmp/conda-build /tmp/out + cp -r ~/.artifacts/legate_core /tmp/conda-build/ + local conda_build_args=(); conda_build_args+=(--override-channels); conda_build_args+=(-c conda-forge); @@ -34,7 +37,21 @@ build_cunumeric_conda_package() { # Synthesize new cunumeric conda-build build.sh script - cat <> ~/cunumeric/conda/conda-build/conda_build_config.yaml + cat < ~/cunumeric/conda/conda-build/conda_build_config.yaml +gpu_enabled: + - "${GPU_ENABLED}" + +python: + - "${python_version}" + +numpy_version: + - ">=1.22" + +cmake_version: + - ">=3.20.1,!=3.23.0" + +use_local_path: + - "true" numpy: - 1.22 diff --git a/continuous_integration/home/coder/.local/bin/build-cunumeric-cpp b/continuous_integration/scripts/build-cunumeric-cpp similarity index 93% rename from continuous_integration/home/coder/.local/bin/build-cunumeric-cpp rename to continuous_integration/scripts/build-cunumeric-cpp index 83f6dcd8c7..fd08ceac2f 100755 --- a/continuous_integration/home/coder/.local/bin/build-cunumeric-cpp +++ b/continuous_integration/scripts/build-cunumeric-cpp @@ -9,7 +9,6 @@ build_cunumeric_cpp() { cmake_args+=(-DBUILD_MARCH=${BUILD_MARCH:-haswell}); cmake_args+=(-DCMAKE_BUILD_TYPE=Release); cmake_args+=(-DCMAKE_CUDA_ARCHITECTURES=RAPIDS); - cmake_args+=(-Dlegate_core_ROOT=$HOME/legate/build); cmake_args+=(-DCMAKE_BUILD_PARALLEL_LEVEL=${JOBS:-$(nproc --ignore=1)}); cmake_args+=(${@}); diff --git a/continuous_integration/home/coder/.local/bin/build-cunumeric-wheel b/continuous_integration/scripts/build-cunumeric-wheel similarity index 100% rename from continuous_integration/home/coder/.local/bin/build-cunumeric-wheel rename to continuous_integration/scripts/build-cunumeric-wheel diff --git a/continuous_integration/scripts/entrypoint b/continuous_integration/scripts/entrypoint new file mode 100755 index 0000000000..298fc1c7a1 --- /dev/null +++ b/continuous_integration/scripts/entrypoint @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +sccache_stop_server_and_show_stats() { + sccache --stop-server || true && sccache --show-stats; +} + +init_devcontainer() { + # disable xtrace and history + local xtrace_enabled=$(echo "${SHELLOPTS:-}" | grep -q 'xtrace'; echo $?); + local history_enabled=$(echo "${SHELLOPTS:-}" | grep -q 
'history'; echo $?); + { set +xo history; } 2>/dev/null; + eval "export $(find /run/secrets/ -type f -exec bash -c 'echo ${0/\/run\/secrets\//}=$(<${0})' {} \;)"; + if [ "${history_enabled}" -eq "0" ]; then { set -o history; } 2>/dev/null; fi; + if [ "${xtrace_enabled}" -eq "0" ]; then { set -o xtrace; } 2>/dev/null; fi; + + set -xeo pipefail + + . devcontainer-utils-post-attach-command; + + sleep 10; + . devcontainer-utils-vault-s3-test; + . devcontainer-utils-vault-s3-export 0; +} + +entrypoint() { + set -x + + mkdir -p /home/coder/.cache; + + local secrets_dir=/run/secrets + + if [ -d "$secrets_dir" ] && [ "$(ls -A $secrets_dir)" ]; then + init_devcontainer + else + sccache_stop_server_and_show_stats + fi + + echo AWS_REGION=${AWS_REGION:-} + echo AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN:-} + echo AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} + echo AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} + + exec "$@"; +} + +entrypoint "$@"; diff --git a/continuous_integration/scripts/test-cunumeric b/continuous_integration/scripts/test-cunumeric new file mode 100755 index 0000000000..ca57b42e97 --- /dev/null +++ b/continuous_integration/scripts/test-cunumeric @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +setup_env() { + mamba create -yn legate -c ~/.artifacts/conda-build/legate_core -c ~/.artifacts/conda-build/cunumeric -c conda-forge -c "nvidia/label/cuda-12.0.0" legate-core cunumeric +} + +setup_test_env() { + mamba install -y "clang-tools>=8" "clang>=8" colorama coverage mock pre-commit pytest-cov pytest-lazy-fixture pytest-mock pytest types-docutils pynvml + + pip install tifffile +} + +setup_docs_env() { + mamba install -y pandoc doxygen + pip install ipython jinja2 "markdown<3.4.0" "pydata-sphinx-theme>=0.13" myst-parser nbsphinx sphinx-copybutton "sphinx>=4.4.0" + +} + +setup_mypy_env() { + mamba install -y "mypy>=0.961" jinja2 nbsphinx sphinx-copybutton "sphinx>=4.4.0" types-docutils +} + +test-cunumeric() { + set -xeo pipefail + + setup_env; + + set +u + mamba activate legate; + conda info; + + cd ~/cunumeric; + + case "$1" in + "test") + echo "Executing tests..." + shift; + setup_test_env; + ./test.py --verbose "$@" + ;; + "mypy") + echo "Installing and executing mypy..." + shift; + setup_mypy_env; + mypy cunumeric + ;; + "docs") + echo "Building docs..." + shift; + setup_docs_env; + cd docs/cunumeric + make clean html + ;; + *) + echo "Invalid command: $1" + return 1 + ;; + esac +} + +(test-cunumeric "$@"); \ No newline at end of file From c813d99ec910d395b96d4b2b9c70aad0f2a66b00 Mon Sep 17 00:00:00 2001 From: Sandeep Datta <128171450+sandeepd-nv@users.noreply.github.com> Date: Wed, 27 Sep 2023 21:47:50 +0530 Subject: [PATCH 02/18] Enable OpenMP tests in CI. (#1051) * Enable OpenMP tests in CI. * Removed ci.yml. 
--- .github/workflows/ci.yml | 136 ------------------------ .github/workflows/gh-build-and-test.yml | 4 +- 2 files changed, 2 insertions(+), 138 deletions(-) delete mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index c147cd17da..0000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Build cunumeric -on: - push: - branches-ignore: - - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) - pull_request: - branches-ignore: - - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) - schedule: - # * is a special character in YAML so you have to quote this string - - cron: '0 */6 * * *' -env: - COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} - PROJECT: github-cunumeric-ci - REF: ${{ github.event.pull_request.head.ref || github.ref }} - BASE_REF: ${{ github.event.pull_request.base.ref || github.ref }} - EVENT_NAME: ${{ github.event_name }} - LABEL: ${{ github.event.pull_request.head.label }} - # Prevent output buffering - PYTHONUNBUFFERED: 1 -jobs: - build: - if: ${{ github.repository == 'nv-legate/cunumeric' }} - runs-on: self-hosted - steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJSON(github) }} - run: echo "$GITHUB_CONTEXT" - - name: Dump job context - env: - JOB_CONTEXT: ${{ toJSON(job) }} - run: echo "$JOB_CONTEXT" - - name: Dump steps context - env: - STEPS_CONTEXT: ${{ toJSON(steps) }} - run: echo "$STEPS_CONTEXT" - - name: Dump runner context - env: - RUNNER_CONTEXT: ${{ toJSON(runner) }} - run: echo "$RUNNER_CONTEXT" - - name: Dump strategy context - env: - STRATEGY_CONTEXT: ${{ toJSON(strategy) }} - run: echo "$STRATEGY_CONTEXT" - - name: Dump matrix context - env: - MATRIX_CONTEXT: ${{ toJSON(matrix) }} - run: echo "$MATRIX_CONTEXT" - - name: Run CI build - run: | - /data/github-runner/legate-bin/setup.sh - cd legate-ci/github-ci/cunumeric - rm -rf ngc-artifacts || true - ./build-separate.sh > ${COMMIT}-build.log 2>&1 - - name: Process Output - run: | - cd legate-ci/github-ci/cunumeric - cat *artifacts/*/* - if: always() - - name: Upload Build Log - if: always() - uses: actions/upload-artifact@v3 - with: - name: build-log - path: ./**/${{ env.COMMIT }}-build.log.gpg - test: - if: ${{ github.repository == 'nv-legate/cunumeric' }} - runs-on: self-hosted - needs: build - strategy: - fail-fast: false - matrix: - include: - - {name: 1 CPU test, options: --cpus 1 --unit --debug, log: cpu} - - {name: 2 CPUs test, options: --cpus 2 --debug, log: cpus} - - {name: GPU test, options: --use cuda --gpus 1 --debug, log: gpu} - - {name: 2 GPUs test, options: --use cuda --gpus 2 --debug, log: gpus} - - {name: OpenMP test, options: --use openmp --omps 1 --ompthreads 2 --debug, log: omp} - - {name: 2 NUMA OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --numamem 2048 --debug, log: omps} - - {name: Eager execution test, options: --use eager --debug, log: eager} - - {name: mypy, options: mypy, log: mypy} - - {name: documentation, options: docs, log: docs} - name: ${{ matrix.name }} - steps: - - name: Dump GitHub context - env: - GITHUB_CONTEXT: ${{ toJSON(github) }} - run: echo "$GITHUB_CONTEXT" - - name: Dump job context - env: - JOB_CONTEXT: ${{ toJSON(job) }} - run: echo "$JOB_CONTEXT" - - name: Dump steps context - env: - STEPS_CONTEXT: ${{ toJSON(steps) }} - run: echo "$STEPS_CONTEXT" - - name: Dump runner context - env: - RUNNER_CONTEXT: ${{ toJSON(runner) }} - run: echo 
"$RUNNER_CONTEXT" - - name: Dump strategy context - env: - STRATEGY_CONTEXT: ${{ toJSON(strategy) }} - run: echo "$STRATEGY_CONTEXT" - - name: Dump matrix context - env: - MATRIX_CONTEXT: ${{ toJSON(matrix) }} - run: echo "$MATRIX_CONTEXT" - - name: Prepare - run: | - /data/github-runner/legate-bin/setup.sh - cd legate-ci/github-ci/cunumeric - if [[ ! -d ngc-artifacts ]] - then - mkdir ngc-artifacts - else - rm -rf ngc-artifacts/* - fi - - name: Test - run: | - cd legate-ci/github-ci/cunumeric - [[ "${{ matrix.name }}" == "Eager"* ]] && export PYTHONFAULTHANDLER=1 - ./test.sh ${{ matrix.options }} > ${COMMIT}-test-${{ matrix.log }}.log 2>&1 - - name: Process output - if: always() - run: | - cd legate-ci/github-ci/cunumeric - /data/github-runner/legate-bin/encrypt.sh ${COMMIT}-test-${{ matrix.log }}.log - cat *artifacts/*/* - - name: Upload Log - if: always() - uses: actions/upload-artifact@v3 - with: - name: test-${{ matrix.log }}-log - path: ./**/${{ env.COMMIT }}-test-${{ matrix.log }}.log.gpg diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml index e0e87899da..3766a07ee6 100644 --- a/.github/workflows/gh-build-and-test.yml +++ b/.github/workflows/gh-build-and-test.yml @@ -54,13 +54,13 @@ jobs: options: test --use openmp --omps 1 --ompthreads 2 --debug runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-32cpu' }} has-gpu: ${{ inputs.device == 'gpu' }} - enabled: false + enabled: true - name: 2 NUMA OpenMPs test options: test --use openmp --omps 2 --ompthreads 2 --numamem 2048 --debug runner: ${{ inputs.device == 'gpu' && 'linux-amd64-gpu-v100-latest-1' || 'linux-amd64-32cpu' }} has-gpu: ${{ inputs.device == 'gpu' }} - enabled: false + enabled: true - name: Eager execution test options: test --use eager --debug From 45e76b26baadf332c4a8036773f2968e08dd8189 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 29 Sep 2023 08:51:18 -0700 Subject: [PATCH 03/18] Update version to 23.11 (#1052) * Update version * Update legion version --- CMakeLists.txt | 2 +- cmake/versions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68dfe83f6f..5f1c6a8581 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,7 +55,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -set(cunumeric_version 23.09.00) +set(cunumeric_version 23.11.00) # For now we want the optimization flags to match on both normal make and cmake # builds so we override the cmake defaults here for release, this changes diff --git a/cmake/versions.json b/cmake/versions.json index 7a98894dd5..b99da26cb4 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -1,11 +1,11 @@ { "packages" : { "legate_core" : { - "version": "23.09.00", + "version": "23.11.00", "git_url" : "https://github.com/nv-legate/legate.core.git", "git_shallow": false, "always_download": false, - "git_tag" : "06b0e4d7fded0b4207fd8b4ba34c330333ee3543" + "git_tag" : "8997f997be02936304b3ac23fe785f1de7a3424b" } } } From 233ef08480187deb2b5f013b7c1f89341ae454e9 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 3 Oct 2023 14:30:41 -0700 Subject: [PATCH 04/18] replacing set with OrderedSet to avoid control-replication violations (#1054) replacing set with OrderedSet to avoid control-replication violations --- cunumeric/_ufunc/ufunc.py | 3 ++- cunumeric/array.py | 10 +++++----- cunumeric/coverage.py | 5 +++-- cunumeric/deferred.py | 17 +++++++++-------- cunumeric/utils.py | 15 +++++++++++---- 
tests/integration/test_einsum.py | 13 ++++++++----- tests/integration/utils/contractions.py | 8 ++++++-- 7 files changed, 44 insertions(+), 27 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 3079f32616..11800e53f2 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING, Any, Dict, Sequence, Union import numpy as np +from legate.core.utils import OrderedSet from ..array import check_writeable, convert_to_cunumeric_ndarray, ndarray from ..config import BinaryOpCode, UnaryOpCode, UnaryRedCode @@ -552,7 +553,7 @@ def _find_common_type( arrs: Sequence[ndarray], orig_args: Sequence[Any] ) -> np.dtype[Any]: all_ndarray = all(isinstance(arg, ndarray) for arg in orig_args) - unique_dtypes = set(arr.dtype for arr in arrs) + unique_dtypes = OrderedSet(arr.dtype for arr in arrs) # If all operands are ndarrays and they all have the same dtype, # we already know the common dtype if len(unique_dtypes) == 1 and all_ndarray: diff --git a/cunumeric/array.py b/cunumeric/array.py index 9a172305c1..dd7079fa30 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -25,7 +25,6 @@ Literal, Optional, Sequence, - Set, TypeVar, Union, cast, @@ -33,6 +32,7 @@ import numpy as np from legate.core import Array, Field +from legate.core.utils import OrderedSet from numpy.core.multiarray import ( # type: ignore [attr-defined] normalize_axis_index, ) @@ -90,7 +90,7 @@ def add_boilerplate( parameter (if present), to cuNumeric ndarrays. * Convert the special "where" parameter (if present) to a valid predicate. """ - keys = set(array_params) + keys = OrderedSet(array_params) assert len(keys) == len(array_params) def decorator(func: Callable[P, R]) -> Callable[P, R]: @@ -100,11 +100,11 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]: # For each parameter specified by name, also consider the case where # it's passed as a positional parameter. - indices: Set[int] = set() + indices: OrderedSet[int] = OrderedSet() where_idx: Optional[int] = None out_idx: Optional[int] = None params = signature(func).parameters - extra = keys - set(params) + extra = keys - OrderedSet(params) assert len(extra) == 0, f"unknown parameter(s): {extra}" for idx, param in enumerate(params): if param == "where": @@ -2435,7 +2435,7 @@ def _diag_helper( else: assert axes is not None N = len(axes) - if len(axes) != len(set(axes)): + if len(axes) != len(OrderedSet(axes)): raise ValueError( "axes passed to _diag_helper should be all different" ) diff --git a/cunumeric/coverage.py b/cunumeric/coverage.py index 55f74d238d..a8e57285f5 100644 --- a/cunumeric/coverage.py +++ b/cunumeric/coverage.py @@ -36,6 +36,7 @@ ) from legate.core import track_provenance +from legate.core.utils import OrderedSet from typing_extensions import Protocol from .runtime import runtime @@ -62,7 +63,7 @@ def filter_namespace( omit_names: Optional[Container[str]] = None, omit_types: tuple[type, ...] 
= (), ) -> dict[str, Any]: - omit_names = omit_names or set() + omit_names = omit_names or OrderedSet() return { attr: value for attr, value in ns.items() @@ -330,7 +331,7 @@ def clone_class( """ class_name = f"{origin_class.__module__}.{origin_class.__name__}" - clean_omit_names = set() if omit_names is None else omit_names + clean_omit_names = OrderedSet() if omit_names is None else omit_names def _clone_class(cls: type) -> type: missing = filter_namespace( diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5cbea74bfb..6a56f65dbe 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -36,6 +36,7 @@ import legate.core.types as ty import numpy as np from legate.core import Annotation, Future, ReductionOp, Store +from legate.core.utils import OrderedSet from numpy.core.numeric import ( # type: ignore [attr-defined] normalize_axis_tuple, ) @@ -96,7 +97,7 @@ def auto_convert( """ Converts all named parameters to DeferredArrays. """ - keys = set(thunk_params) + keys = OrderedSet(thunk_params) assert len(keys) == len(thunk_params) def decorator(func: Callable[P, R]) -> Callable[P, R]: @@ -107,7 +108,7 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]: # For each parameter specified by name, also consider the case where # it's passed as a positional parameter. params = signature(func).parameters - extra = keys - set(params) + extra = keys - OrderedSet(params) assert len(extra) == 0, f"unknown parameter(s): {extra}" indices = {idx for (idx, param) in enumerate(params) if param in keys} @@ -1429,7 +1430,7 @@ def fft( task.add_scalar_arg(kind.type_id, ty.int32) task.add_scalar_arg(direction.value, ty.int32) task.add_scalar_arg( - len(set(axes)) != len(axes) + len(OrderedSet(axes)) != len(axes) or len(axes) != input.ndim or tuple(axes) != tuple(sorted(axes)), ty.bool_, @@ -1437,8 +1438,8 @@ def fft( for ax in axes: task.add_scalar_arg(ax, ty.int64) - if input.ndim > len(set(axes)): - task.add_broadcast(input, axes=set(axes)) + if input.ndim > len(OrderedSet(axes)): + task.add_broadcast(input, axes=OrderedSet(axes)) else: task.add_broadcast(input) task.add_constraint(p_output == p_input) @@ -1502,9 +1503,9 @@ def contract( # Sanity checks # no duplicate modes within an array - assert len(lhs_modes) == len(set(lhs_modes)) - assert len(rhs1_modes) == len(set(rhs1_modes)) - assert len(rhs2_modes) == len(set(rhs2_modes)) + assert len(lhs_modes) == len(OrderedSet(lhs_modes)) + assert len(rhs1_modes) == len(OrderedSet(rhs1_modes)) + assert len(rhs2_modes) == len(OrderedSet(rhs2_modes)) # no singleton modes mode_counts: Counter[str] = Counter() mode_counts.update(lhs_modes) diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 55a9b8c1e8..7071545238 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -22,6 +22,7 @@ import legate.core.types as ty import numpy as np +from legate.core.utils import OrderedSet from .types import NdShape @@ -194,8 +195,8 @@ def check_axes(a_axes: Axes, b_axes: Axes) -> None: len(a_axes) != len(b_axes) or len(a_axes) > a_ndim or len(b_axes) > b_ndim - or len(a_axes) != len(set(a_axes)) - or len(b_axes) != len(set(b_axes)) + or len(a_axes) != len(OrderedSet(a_axes)) + or len(b_axes) != len(OrderedSet(b_axes)) or any(ax < 0 for ax in a_axes) or any(ax < 0 for ax in b_axes) or any(ax >= a_ndim for ax in a_axes) @@ -211,8 +212,14 @@ def check_axes(a_axes: Axes, b_axes: Axes) -> None: b_modes = list(ascii_uppercase[:b_ndim]) for a_i, b_i in zip(a_axes, b_axes): b_modes[b_i] = a_modes[a_i] - a_out = [a_modes[a_i] for a_i in 
sorted(set(range(a_ndim)) - set(a_axes))] - b_out = [b_modes[b_i] for b_i in sorted(set(range(b_ndim)) - set(b_axes))] + a_out = [ + a_modes[a_i] + for a_i in sorted(OrderedSet(range(a_ndim)) - OrderedSet(a_axes)) + ] + b_out = [ + b_modes[b_i] + for b_i in sorted(OrderedSet(range(b_ndim)) - OrderedSet(b_axes)) + ] return (a_modes, b_modes, a_out + b_out) diff --git a/tests/integration/test_einsum.py b/tests/integration/test_einsum.py index 96492b7239..e482e8cf09 100644 --- a/tests/integration/test_einsum.py +++ b/tests/integration/test_einsum.py @@ -19,6 +19,7 @@ import numpy as np import pytest +from legate.core.utils import OrderedSet from utils.comparisons import allclose from utils.generators import mk_0to1_array, permutes_to @@ -54,8 +55,8 @@ def gen_operand( return # If we've hit the limit on distinct modes, only use modes # appearing on the same operand - if len(op) == dim_lim - 1 and len(set(op)) >= mode_lim: - for m in sorted(set(op)): + if len(op) == dim_lim - 1 and len(OrderedSet(op)) >= mode_lim: + for m in sorted(OrderedSet(op)): op.append(m) yield from gen_operand(used_modes, dim_lim, mode_lim, op) op.pop() @@ -82,7 +83,7 @@ def gen_expr( if opers is None: opers = [] if cache is None: - cache = set() + cache = OrderedSet() # The goal here is to avoid producing duplicate expressions, up to # reordering of operands and alpha-renaming, e.g. the following # are considered equivalent (for the purposes of testing): @@ -108,7 +109,9 @@ def gen_expr( dim_lim = len(opers[-1]) if len(opers) > 0 else MAX_OPERAND_DIM # Between operands of the same length, put those with the most distinct # modes first. - mode_lim = len(set(opers[-1])) if len(opers) > 0 else MAX_OPERAND_DIM + mode_lim = ( + len(OrderedSet(opers[-1])) if len(opers) > 0 else MAX_OPERAND_DIM + ) for op in gen_operand(used_modes, dim_lim, mode_lim): opers.append(op) yield from gen_expr(opers, cache) @@ -187,7 +190,7 @@ def mk_input_that_broadcasts_to(lib, tgt_shape): # just one of them to 1. Consider the operation 'aab->ab': (10,10,11), # (10,10,1), (1,1,11), (1,1,1) are all acceptable input shapes, but # (1,10,11) is not. 
- tgt_sizes = list(sorted(set(tgt_shape))) + tgt_sizes = list(sorted(OrderedSet(tgt_shape))) res = [] for mask in product([True, False], repeat=len(tgt_sizes)): tgt2src_size = { diff --git a/tests/integration/utils/contractions.py b/tests/integration/utils/contractions.py index 46886f1165..c590adbe46 100644 --- a/tests/integration/utils/contractions.py +++ b/tests/integration/utils/contractions.py @@ -15,6 +15,7 @@ import numpy as np from legate.core import LEGATE_MAX_DIM +from legate.core.utils import OrderedSet import cunumeric as num @@ -38,7 +39,7 @@ def gen_output_default(lib, modes, a, b): def gen_shapes(a_modes, b_modes): yield ((5,) * len(a_modes), (5,) * len(b_modes)) - for mode_to_squeeze in set(a_modes + b_modes): + for mode_to_squeeze in OrderedSet(a_modes + b_modes): a_shape = tuple((1 if m == mode_to_squeeze else 5) for m in a_modes) b_shape = tuple((1 if m == mode_to_squeeze else 5) for m in b_modes) yield (a_shape, b_shape) @@ -104,7 +105,10 @@ def gen_output_of_various_types(lib, modes, a, b): def _test(name, modes, operation, gen_inputs, gen_output=None, **kwargs): (a_modes, b_modes, out_modes) = modes - if len(set(a_modes) | set(b_modes) | set(out_modes)) > LEGATE_MAX_DIM: + if ( + len(OrderedSet(a_modes) | OrderedSet(b_modes) | OrderedSet(out_modes)) + > LEGATE_MAX_DIM + ): # Total number of distinct modes can't exceed maximum Legion dimension, # because we may need to promote arrays so that one includes all modes. return From 2e520817283f63fcce3aec20fa74d0259cf9edc4 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 4 Oct 2023 08:24:59 -0700 Subject: [PATCH 05/18] Satisfy new flake8 type check rules (#1056) --- cunumeric/array.py | 4 ++-- cunumeric/module.py | 4 ++-- cunumeric/utils.py | 6 +++--- tests/integration/test_prod.py | 4 ++-- tests/integration/test_reduction.py | 4 ++-- tests/integration/test_squeeze.py | 4 ++-- tests/unit/cunumeric/test_coverage.py | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index dd7079fa30..0176798bce 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2554,7 +2554,7 @@ def diagonal( raise ValueError("extract can be true only for Ndim >=2") axes = None else: - if type(axis1) == int and type(axis2) == int: + if isinstance(axis1, int) and isinstance(axis2, int): if axes is not None: raise ValueError( "Either axis1/axis2 or axes must be supplied" @@ -3102,7 +3102,7 @@ def mean( Multiple GPUs, Multiple CPUs """ - if axis is not None and type(axis) != int: + if axis is not None and not isinstance(axis, int): raise NotImplementedError( "cunumeric.mean only supports int types for " "'axis' currently" diff --git a/cunumeric/module.py b/cunumeric/module.py index c676dc02c5..e8d933da65 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4760,7 +4760,7 @@ def einsum_path( """ computed_operands = [convert_to_cunumeric_ndarray(op) for op in operands] memory_limit = _builtin_max(op.size for op in computed_operands) - if type(optimize) == tuple: + if isinstance(optimize, tuple): if len(optimize) != 2: raise ValueError("einsum_path expects optimize tuples of size 2") optimize, memory_limit = optimize @@ -4771,7 +4771,7 @@ def einsum_path( elif optimize in ["greedy", "optimal"]: pass elif ( - type(optimize) == list + isinstance(optimize, list) and len(optimize) > 1 and optimize[0] == "einsum_path" ): diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 7071545238..93e45fb740 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -234,11 +234,11 @@ 
def deep_apply(obj: Any, func: Callable[[Any], Any]) -> Any: primarily meant to be used for arguments of NumPy API calls, which shouldn't nest their arrays very deep. """ - if type(obj) == list: + if isinstance(obj, list): return [deep_apply(x, func) for x in obj] - elif type(obj) == tuple: + elif isinstance(obj, tuple): return tuple(deep_apply(x, func) for x in obj) - elif type(obj) == dict: + elif isinstance(obj, dict): return {k: deep_apply(v, func) for k, v in obj.items()} else: return func(obj) diff --git a/tests/integration/test_prod.py b/tests/integration/test_prod.py index 8b627ecd12..ab0f4def8f 100644 --- a/tests/integration/test_prod.py +++ b/tests/integration/test_prod.py @@ -263,7 +263,7 @@ def test_out_axis(self, size): ndim = arr_np.ndim for axis in range(-ndim + 1, ndim, 1): out_shape = () - if type(size) == tuple: + if isinstance(size, tuple): out_shape_list = list(size) del out_shape_list[axis] out_shape = tuple(out_shape_list) @@ -283,7 +283,7 @@ def test_out_axis_dtype(self, size): ndim = arr_np.ndim for axis in range(-ndim + 1, ndim, 1): out_shape = () - if type(size) == tuple: + if isinstance(size, tuple): out_shape_list = list(size) del out_shape_list[axis] out_shape = tuple(out_shape_list) diff --git a/tests/integration/test_reduction.py b/tests/integration/test_reduction.py index 58f133a1ff..a7a89a6af0 100644 --- a/tests/integration/test_reduction.py +++ b/tests/integration/test_reduction.py @@ -219,7 +219,7 @@ def test_out_axis(self, size): ndim = arr_np.ndim for axis in range(-ndim + 1, ndim, 1): out_shape = () - if type(size) == tuple: + if isinstance(size, tuple): out_shape_list = list(size) del out_shape_list[axis] out_shape = tuple(out_shape_list) @@ -239,7 +239,7 @@ def test_out_axis_dtype(self, size): ndim = arr_np.ndim for axis in range(-ndim + 1, ndim, 1): out_shape = () - if type(size) == tuple: + if isinstance(size, tuple): out_shape_list = list(size) del out_shape_list[axis] out_shape = tuple(out_shape_list) diff --git a/tests/integration/test_squeeze.py b/tests/integration/test_squeeze.py index 84ac8be2e0..14c2fda0d1 100644 --- a/tests/integration/test_squeeze.py +++ b/tests/integration/test_squeeze.py @@ -125,7 +125,7 @@ def test_array_basic(size): @pytest.mark.parametrize( - "size", (s for s in SIZES if type(s) == tuple if 1 in s), ids=str + "size", (s for s in SIZES if isinstance(s, tuple) if 1 in s), ids=str ) def test_num_axis(size): a = np.random.randint(low=-10, high=10, size=size) @@ -139,7 +139,7 @@ def test_num_axis(size): @pytest.mark.parametrize( - "size", (s for s in SIZES if type(s) == tuple if 1 in s), ids=str + "size", (s for s in SIZES if isinstance(s, tuple) if 1 in s), ids=str ) def test_array_axis(size): a = np.random.randint(low=-10, high=10, size=size) diff --git a/tests/unit/cunumeric/test_coverage.py b/tests/unit/cunumeric/test_coverage.py index 0ce089e094..ca683b51c5 100644 --- a/tests/unit/cunumeric/test_coverage.py +++ b/tests/unit/cunumeric/test_coverage.py @@ -388,8 +388,8 @@ def __array_prepare__(self): return "I am now ready" def foo(self, other): - assert type(self) == _Orig_ndarray - assert type(other) == _Orig_ndarray + assert type(self) == _Orig_ndarray # noqa + assert type(other) == _Orig_ndarray # noqa return "original foo" def bar(self, other): From 74490ef5f6ef44c1eb3adf39f90fdf09912fb1e1 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 4 Oct 2023 09:32:52 -0700 Subject: [PATCH 06/18] Inline boolean operators in NumPy are bitwise, not logical (#1057) * Inline boolean operators in NumPy are bitwise, not 
logical * Add tests for inline operators --- cunumeric/array.py | 28 +++++------ tests/integration/test_binary_ufunc.py | 69 +++++++++++++++++++------- tests/integration/test_unary_ufunc.py | 53 ++++++++++++-------- 3 files changed, 97 insertions(+), 53 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0176798bce..91ad41dde5 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -817,9 +817,9 @@ def __and__(self, rhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_and + from ._ufunc import bitwise_and - return logical_and(self, rhs) + return bitwise_and(self, rhs) def __array__( self, dtype: Union[np.dtype[Any], None] = None @@ -1073,9 +1073,9 @@ def __iand__(self, rhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_and + from ._ufunc import bitwise_and - return logical_and(self, rhs, out=self) + return bitwise_and(self, rhs, out=self) def __idiv__(self, rhs: Any) -> ndarray: """a.__idiv__(value, /) @@ -1186,9 +1186,9 @@ def __ior__(self, rhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_or + from ._ufunc import bitwise_or - return logical_or(self, rhs, out=self) + return bitwise_or(self, rhs, out=self) def __ipow__(self, rhs: float) -> ndarray: """a.__ipow__(/) @@ -1260,9 +1260,9 @@ def __ixor__(self, rhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_xor + from ._ufunc import bitwise_xor - return logical_xor(self, rhs, out=self) + return bitwise_xor(self, rhs, out=self) def __le__(self, rhs: Any) -> ndarray: """a.__le__(value, /) @@ -1416,9 +1416,9 @@ def __or__(self, rhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_or + from ._ufunc import bitwise_or - return logical_or(self, rhs) + return bitwise_or(self, rhs) def __pos__(self) -> ndarray: """a.__pos__(value, /) @@ -1473,9 +1473,9 @@ def __rand__(self, lhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_and + from ._ufunc import bitwise_and - return logical_and(lhs, self) + return bitwise_and(lhs, self) def __rdiv__(self, lhs: Any) -> ndarray: """a.__rdiv__(value, /) @@ -1584,9 +1584,9 @@ def __ror__(self, lhs: Any) -> ndarray: Multiple GPUs, Multiple CPUs """ - from ._ufunc import logical_or + from ._ufunc import bitwise_or - return logical_or(lhs, self) + return bitwise_or(lhs, self) def __rpow__(self, lhs: Any) -> ndarray: """__rpow__(value, /) diff --git a/tests/integration/test_binary_ufunc.py b/tests/integration/test_binary_ufunc.py index 4d2a9b7db7..a6acef0494 100644 --- a/tests/integration/test_binary_ufunc.py +++ b/tests/integration/test_binary_ufunc.py @@ -41,32 +41,53 @@ def check_result(op, in_np, out_np, out_num): def check_ops(ops, in_np, out_dtype="D"): + in_num = tuple(num.array(arr) for arr in in_np) + for op in ops: - op_np = getattr(np, op) - op_num = getattr(num, op) + if op.isidentifier(): + op_np = getattr(np, op) + op_num = getattr(num, op) + assert op_np.nout == 1 + + out_np = op_np(*in_np) + out_num = op_num(*in_num) + + check_result(op, in_np, out_np, out_num) - assert op_np.nout == 1 + out_np = np.empty(out_np.shape, dtype=out_dtype) + out_num = num.empty(out_num.shape, dtype=out_dtype) + op_np(*in_np, out=out_np) + op_num(*in_num, out=out_num) - in_num = tuple(num.array(arr) for arr in in_np) + check_result(op, in_np, out_np, out_num) - out_np = op_np(*in_np) - out_num = op_num(*in_num) + # Ask cuNumeric to produce outputs to NumPy ndarrays + out_num = np.empty(out_np.shape, 
dtype=out_dtype) + op_num(*in_num, out=out_num) - check_result(op, in_np, out_np, out_num) + check_result(op, in_np, out_np, out_num) + + else: + # Doing it this way instead of invoking the dunders directly, to + # avoid having to select the right version, __add__ vs __radd__, + # when one isn't supported, e.g. for scalar.__add__(array) - out_np = np.empty(out_np.shape, dtype=out_dtype) - out_num = num.empty(out_num.shape, dtype=out_dtype) + out_np = eval(f"in_np[0] {op} in_np[1]") + out_num = eval(f"in_num[0] {op} in_num[1]") - op_np(*in_np, out=out_np) - op_num(*in_num, out=out_num) + check_result(op, in_np, out_np, out_num) - check_result(op, in_np, out_np, out_num) + out_np = np.ones_like(out_np) + out_num = num.ones_like(out_num) + exec(f"out_np {op}= in_np[0]") + exec(f"out_num {op}= in_num[0]") - # Ask cuNumeric to produce outputs to NumPy ndarrays - out_num = np.ones(out_np.shape, dtype=out_dtype) - op_num(*in_num, out_num) + check_result(op, in_np, out_np, out_num) - check_result(op, in_np, out_np, out_num) + out_num = np.ones_like(out_np) + exec(f"out_num {op}= in_num[0]") + + check_result(op, in_np, out_np, out_num) def test_all(): @@ -74,8 +95,14 @@ def test_all(): # for some boring inputs. For some of these, we will want to # test corner cases in the future. + # TODO: matmul, @ + # Math operations ops = [ + "*", + "+", + "-", + "/", "add", # "divmod", "equal", @@ -121,6 +148,7 @@ def test_all(): check_ops(ops, (scalar1, scalar2)) ops = [ + "//", "arctan2", "copysign", "floor_divide", @@ -142,6 +170,7 @@ def test_all(): check_ops(ops, (scalar1, scalar2)) ops = [ + "**", "power", "float_power", ] @@ -159,6 +188,7 @@ def test_all(): check_ops(ops, (scalars[3], scalars[0])) ops = [ + "%", "remainder", ] @@ -173,12 +203,17 @@ def test_all(): check_ops(ops, (scalar1, scalar2)) ops = [ + "&", + "<<", + ">>", + "^", + "|", "bitwise_and", "bitwise_or", "bitwise_xor", "gcd", - "left_shift", "lcm", + "left_shift", "right_shift", ] diff --git a/tests/integration/test_unary_ufunc.py b/tests/integration/test_unary_ufunc.py index c1deefe853..9d0021613c 100644 --- a/tests/integration/test_unary_ufunc.py +++ b/tests/integration/test_unary_ufunc.py @@ -61,39 +61,46 @@ def check_result(op, in_np, out_np, out_num, **isclose_kwargs): def check_op(op, in_np, out_dtype="d", **check_kwargs): - op_np = getattr(np, op) - op_num = getattr(num, op) + in_num = num.array(in_np) - assert op_np.nout == 1 + if op.isidentifier(): + op_np = getattr(np, op) + op_num = getattr(num, op) - in_num = num.array(in_np) + assert op_np.nout == 1 + + out_np = op_np(in_np) + out_num = op_num(in_num) - out_np = op_np(in_np) - out_num = op_num(in_num) + assert check_result(op, in_np, out_np, out_num, **check_kwargs) - assert check_result(op, in_np, out_np, out_num, **check_kwargs) + out_np = np.empty(out_np.shape, dtype=out_dtype) + out_num = num.empty(out_num.shape, dtype=out_dtype) - out_np = np.empty(out_np.shape, dtype=out_dtype) - out_num = num.empty(out_num.shape, dtype=out_dtype) + op_np(in_np, out=out_np) + op_num(in_num, out=out_num) - op_np(in_np, out=out_np) - op_num(in_num, out=out_num) + assert check_result(op, in_np, out_np, out_num, **check_kwargs) - assert check_result(op, in_np, out_np, out_num, **check_kwargs) + out_np = np.empty(out_np.shape, dtype=out_dtype) + out_num = num.empty(out_num.shape, dtype=out_dtype) - out_np = np.empty(out_np.shape, dtype=out_dtype) - out_num = num.empty(out_num.shape, dtype=out_dtype) + op_np(in_np, out_np) + op_num(in_num, out_num) - op_np(in_np, out_np) - op_num(in_num, 
out_num) + assert check_result(op, in_np, out_np, out_num, **check_kwargs) - assert check_result(op, in_np, out_np, out_num, **check_kwargs) + # Ask cuNumeric to produce outputs to NumPy ndarrays + out_num = np.ones(out_np.shape, dtype=out_dtype) + op_num(in_num, out_num) - # Ask cuNumeric to produce outputs to NumPy ndarrays - out_num = np.ones(out_np.shape, dtype=out_dtype) - op_num(in_num, out_num) + assert check_result(op, in_np, out_np, out_num, **check_kwargs) + + else: + out_np = eval(f"{op} in_np") + out_num = eval(f"{op} in_num") - assert check_result(op, in_np, out_np, out_num, **check_kwargs) + assert check_result(op, in_np, out_np, out_num, **check_kwargs) def check_ops(ops, in_np, out_dtype="d"): @@ -155,6 +162,8 @@ def check_math_ops(op, **kwargs): # Math operations math_ops = ( + "+", + "-", "absolute", "conjugate", "exp", @@ -283,7 +292,7 @@ def test_arc_hyp_trig_ops(op): check_op(op, np.array(np.random.uniform(low=1, high=5))) -bit_ops = ("invert",) +bit_ops = ("invert", "~") @pytest.mark.parametrize("op", bit_ops) From 0645bc7330c5896c8b767303735a997448b42d1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Oct 2023 18:04:20 -0700 Subject: [PATCH 07/18] [pre-commit.ci] pre-commit autoupdate (#1055) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.3.0 → 23.9.1](https://github.com/psf/black/compare/23.3.0...23.9.1) - [github.com/PyCQA/flake8: 6.0.0 → 6.1.0](https://github.com/PyCQA/flake8/compare/6.0.0...6.1.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Manolis Papadakis --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b637e8ae7..03cfc8b1c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,11 +11,11 @@ repos: hooks: - id: isort - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 23.9.1 hooks: - id: black - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format From 8138188a8bcccfdd3558f1133580b971a978688a Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 19 Oct 2023 12:04:09 -0700 Subject: [PATCH 08/18] Added variance as a unary reduction (#593) * added variance as a unary reduction * fix variance eager implementation * build fixes * Added more tests. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Work-around (consistent) for 1D array. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix for 1D arrays masquerading as Nd. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Added relevant comment for 1D array branch. * Tests for keepdims. * Clean-up. * Fix for test_mean.py. * Dox fix: added var entry in RST file. * Put ignore directive back. * Fixed doc. * More dox fixes. * Fix the check for the cases that trigger use of VARIANCE * Addressed minor review comments on dox. * Commit fixes necessary for 1475898 to work * Addressed changes on np.square(). * Addressed changes on where arg. * Addressed changes on module.py var doc string. * Addressed changes on axis signature in var(). 
--------- Co-authored-by: Andrei Schaffer Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Andrei Schaffer <37386037+aschaffer@users.noreply.github.com> Co-authored-by: Manolis Papadakis Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 151 +++++++++++-- cunumeric/config.py | 4 + cunumeric/deferred.py | 4 + cunumeric/eager.py | 20 ++ cunumeric/module.py | 73 +++++++ cunumeric/utils.py | 9 +- docs/cunumeric/source/api/ndarray.rst | 2 +- docs/cunumeric/source/api/statistics.rst | 1 + src/cunumeric/cunumeric_c.h | 2 + .../unary/scalar_unary_red_template.inl | 12 +- src/cunumeric/unary/unary_red_util.h | 52 +++++ tests/integration/test_stats.py | 205 ++++++++++++++++++ 12 files changed, 507 insertions(+), 28 deletions(-) create mode 100644 tests/integration/test_stats.py diff --git a/cunumeric/array.py b/cunumeric/array.py index 91ad41dde5..a219e416ac 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -54,7 +54,13 @@ from .coverage import FALLBACK_WARNING, clone_class, is_implemented from .runtime import runtime from .types import NdShape -from .utils import deep_apply, dot_modes, to_core_dtype +from .utils import ( + calculate_volume, + deep_apply, + dot_modes, + to_core_dtype, + tuple_pop, +) if TYPE_CHECKING: from pathlib import Path @@ -3079,12 +3085,40 @@ def max( where=where, ) + def _summation_dtype( + self, dtype: Optional[np.dtype[Any]] + ) -> np.dtype[Any]: + # Pick our dtype if it wasn't picked yet + if dtype is None: + if self.dtype.kind != "f" and self.dtype.kind != "c": + return np.dtype(np.float64) + else: + return self.dtype + return dtype + + def _normalize_summation( + self, sum_array: Any, axis: Any, dtype: np.dtype[Any], ddof: int = 0 + ) -> None: + if axis is None: + divisor = reduce(lambda x, y: x * y, self.shape, 1) - ddof + else: + divisor = self.shape[axis] - ddof + + # Divide by the number of things in the collapsed dimensions + # Pick the right kinds of division based on the dtype + if dtype.kind == "f" or dtype.kind == "c": + sum_array.__itruediv__( + np.array(divisor, dtype=sum_array.dtype), + ) + else: + sum_array.__ifloordiv__(np.array(divisor, dtype=sum_array.dtype)) + @add_boilerplate() def mean( self, axis: Any = None, - dtype: Union[np.dtype[Any], None] = None, - out: Union[ndarray, None] = None, + dtype: Optional[np.dtype[Any]] = None, + out: Optional[ndarray] = None, keepdims: bool = False, ) -> ndarray: """a.mean(axis=None, dtype=None, out=None, keepdims=False) @@ -3105,14 +3139,11 @@ def mean( if axis is not None and not isinstance(axis, int): raise NotImplementedError( "cunumeric.mean only supports int types for " - "'axis' currently" + "`axis` currently" ) - # Pick our dtype if it wasn't picked yet - if dtype is None: - if self.dtype.kind != "f" and self.dtype.kind != "c": - dtype = np.dtype(np.float64) - else: - dtype = self.dtype + + dtype = self._summation_dtype(dtype) + # Do the sum if out is not None and out.dtype == dtype: sum_array = self.sum( @@ -3127,18 +3158,9 @@ def mean( dtype=dtype, keepdims=keepdims, ) - if axis is None: - divisor = reduce(lambda x, y: x * y, self.shape, 1) - else: - divisor = self.shape[axis] - # Divide by the number of things in the collapsed dimensions - # Pick the right kinds of division based on the dtype - if dtype.kind == "f" or dtype.kind == "c": - sum_array.__itruediv__( - np.array(divisor, dtype=sum_array.dtype), - ) - else: - sum_array.__ifloordiv__(np.array(divisor, dtype=sum_array.dtype)) + + self._normalize_summation(sum_array, axis, 
dtype) + + # Convert to the output we didn't already put it there + if out is not None and sum_array is not out: + assert out.dtype != sum_array.dtype @@ -3147,6 +3169,91 @@ def mean( else: return sum_array + @add_boilerplate() + def var( + self, + axis: Optional[Union[int, tuple[int, ...]]] = None, + dtype: Optional[np.dtype[Any]] = None, + out: Optional[ndarray] = None, + ddof: int = 0, + keepdims: bool = False, + *, + where: Union[bool, ndarray] = True, + ) -> ndarray: + """a.var(axis=None, dtype=None, out=None, ddof=0, keepdims=False) + + Returns the variance of the array elements along the given axis. + + Refer to :func:`cunumeric.var` for full documentation. + + See Also + -------- + cunumeric.var : equivalent function + + Availability + -------- + Multiple GPUs, Multiple CPUs + + """ + if axis is not None and not isinstance(axis, int): + raise NotImplementedError( + "cunumeric.var only supports int types for `axis` currently" + ) + + # this could be computed as a single pass through the array + # by computing both <x^2> and <x> and then computing <x^2> - <x>^2. + # this would take the difference of two large numbers and is unstable; + # the mean needs to be computed first and the variance computed + # directly as <(x-mu)^2>, which then requires two passes through the + # data to first compute the mean and then compute the variance + # see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # TODO(https://github.com/nv-legate/cunumeric/issues/590) + + dtype = self._summation_dtype(dtype) + # calculate the mean, but keep the dimensions so that the + # mean can be broadcast against the original array + mu = self.mean(axis=axis, dtype=dtype, keepdims=True) + + # 1D arrays (or equivalent) should benefit from this unary reduction: + # + if axis is None or calculate_volume(tuple_pop(self.shape, axis)) == 1: + # this is a scalar reduction and we can optimize this as a single + # pass through a scalar reduction + result = self._perform_unary_reduction( + UnaryRedCode.VARIANCE, + self, + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + where=where, + args=(mu,), + ) + else: + # TODO(https://github.com/nv-legate/cunumeric/issues/591) + # there isn't really support for generic binary reductions + # right now; all of the current binary reductions are boolean + # reductions like allclose. To implement this in a single pass + # would require a variant of einsum/dot that produces + # (self-mu)*(self-mu) rather than self*mu.
For now, we have to + # compute delta = self-mu in a first pass and then compute + # delta*delta in second pass + delta = self - mu + + result = self._perform_unary_reduction( + UnaryRedCode.SUM_SQUARES, + delta, + axis=axis, + dtype=dtype, + out=out, + keepdims=keepdims, + where=where, + ) + + self._normalize_summation(result, axis=axis, dtype=dtype, ddof=ddof) + + return result + @add_boilerplate() def min( self, diff --git a/cunumeric/config.py b/cunumeric/config.py index 6c5bbbb184..bdea334a16 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -187,6 +187,8 @@ class _CunumericSharedLib: CUNUMERIC_RED_NANSUM: int CUNUMERIC_RED_PROD: int CUNUMERIC_RED_SUM: int + CUNUMERIC_RED_SUM_SQUARES: int + CUNUMERIC_RED_VARIANCE: int CUNUMERIC_REPEAT: int CUNUMERIC_SCALAR_UNARY_RED: int CUNUMERIC_SCAN_GLOBAL: int @@ -452,6 +454,8 @@ class UnaryRedCode(IntEnum): NANSUM = _cunumeric.CUNUMERIC_RED_NANSUM PROD = _cunumeric.CUNUMERIC_RED_PROD SUM = _cunumeric.CUNUMERIC_RED_SUM + SUM_SQUARES = _cunumeric.CUNUMERIC_RED_SUM_SQUARES + VARIANCE = _cunumeric.CUNUMERIC_RED_VARIANCE # Match these to CuNumericBinaryOpCode in cunumeric_c.h diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 6a56f65dbe..a67d9912d2 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -159,6 +159,8 @@ def __init__( _UNARY_RED_TO_REDUCTION_OPS: Dict[int, int] = { UnaryRedCode.SUM: ReductionOp.ADD, + UnaryRedCode.SUM_SQUARES: ReductionOp.ADD, + UnaryRedCode.VARIANCE: ReductionOp.ADD, UnaryRedCode.PROD: ReductionOp.MUL, UnaryRedCode.MAX: ReductionOp.MAX, UnaryRedCode.MIN: ReductionOp.MIN, @@ -209,6 +211,8 @@ def min_identity( _UNARY_RED_IDENTITIES: Dict[UnaryRedCode, Callable[[Any], Any]] = { UnaryRedCode.SUM: lambda _: 0, + UnaryRedCode.SUM_SQUARES: lambda _: 0, + UnaryRedCode.VARIANCE: lambda _: 0, UnaryRedCode.PROD: lambda _: 1, UnaryRedCode.MIN: min_identity, UnaryRedCode.MAX: max_identity, diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 680f1b5a17..63284eb942 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1524,6 +1524,26 @@ def unary_reduction( else where.array, **kws, ) + elif op == UnaryRedCode.SUM_SQUARES: + squared = np.square(rhs.array) + np.sum( + squared, + out=self.array, + axis=orig_axis, + where=where, + keepdims=keepdims, + ) + elif op == UnaryRedCode.VARIANCE: + (mu,) = args + centered = np.subtract(rhs.array, mu) + squares = np.square(centered) + np.sum( + squares, + axis=orig_axis, + where=where, + keepdims=keepdims, + out=self.array, + ) elif op == UnaryRedCode.CONTAINS: self.array.fill(args[0] in rhs.array) elif op == UnaryRedCode.COUNT_NONZERO: diff --git a/cunumeric/module.py b/cunumeric/module.py index e8d933da65..e2bbc78f7a 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7061,6 +7061,79 @@ def mean( return a.mean(axis=axis, dtype=dtype, out=out, keepdims=keepdims) +@add_boilerplate("a") +def var( + a: ndarray, + axis: Optional[Union[int, tuple[int, ...]]] = None, + dtype: Optional[np.dtype[Any]] = None, + out: Optional[ndarray] = None, + ddof: int = 0, + keepdims: bool = False, + *, + where: Union[bool, ndarray] = True, +) -> ndarray: + """ + Compute the variance along the specified axis. + + Returns the variance of the array elements, a measure of the spread of + a distribution. The variance is computed for the flattened array + by default, otherwise over the specified axis. + + Parameters + ---------- + a : array_like + Array containing numbers whose variance is desired. If `a` is not an + array, a conversion is attempted. 
+ axis : None or int or tuple[int], optional + Axis or axes along which the variance is computed. The default is to + compute the variance of the flattened array. + + If this is a tuple of ints, a variance is performed over multiple axes, + instead of a single axis or all the axes as before. + dtype : data-type, optional + Type to use in computing the variance. For arrays of integer type + the default is float64; for arrays of float types + it is the same as the array type. + out : ndarray, optional + Alternate output array in which to place the result. It must have the + same shape as the expected output, but the type is cast if necessary. + ddof : int, optional + “Delta Degrees of Freedom”: the divisor used in the calculation is + N - ddof, where N represents the number of elements. By default + ddof is zero. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the input array. + where : array_like of bool, optional + A boolean array which is broadcasted to match the dimensions of array, + and selects elements to include in the reduction. + + Returns + ------- + m : ndarray, see dtype parameter above + If `out=None`, returns a new array of the same dtype as above + containing the variance values, otherwise a reference to the output + array is returned. + + See Also + -------- + numpy.var + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + return a.var( + axis=axis, + dtype=dtype, + out=out, + ddof=ddof, + keepdims=keepdims, + where=where, + ) + + # Histograms diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 93e45fb740..8c2d701401 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -18,7 +18,7 @@ from functools import reduce from string import ascii_lowercase, ascii_uppercase from types import FrameType -from typing import Any, Callable, List, Sequence, Tuple, Union +from typing import Any, Callable, List, Sequence, Tuple, TypeVar, Union import legate.core.types as ty import numpy as np @@ -108,6 +108,13 @@ def calculate_volume(shape: NdShape) -> int: return reduce(lambda x, y: x * y, shape) +T = TypeVar("T") + + +def tuple_pop(tup: Tuple[T, ...], index: int) -> Tuple[T, ...]: + return tup[:index] + tup[index + 1 :] + + Modes = Tuple[List[str], List[str], List[str]] diff --git a/docs/cunumeric/source/api/ndarray.rst b/docs/cunumeric/source/api/ndarray.rst index afdd1406f8..aca3b9ce0e 100644 --- a/docs/cunumeric/source/api/ndarray.rst +++ b/docs/cunumeric/source/api/ndarray.rst @@ -158,7 +158,7 @@ Calculation ndarray.sum ndarray.cumsum ndarray.mean - .. ndarray.var + ndarray.var .. 
ndarray.std ndarray.prod ndarray.cumprod diff --git a/docs/cunumeric/source/api/statistics.rst b/docs/cunumeric/source/api/statistics.rst index 7d844d7887..9227c93aef 100644 --- a/docs/cunumeric/source/api/statistics.rst +++ b/docs/cunumeric/source/api/statistics.rst @@ -10,6 +10,7 @@ Averages and variances :toctree: generated/ mean + var Histograms diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 74c05fcd2a..b5b3928355 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -150,6 +150,8 @@ enum CuNumericUnaryRedCode { CUNUMERIC_RED_NANSUM, CUNUMERIC_RED_PROD, CUNUMERIC_RED_SUM, + CUNUMERIC_RED_SUM_SQUARES, + CUNUMERIC_RED_VARIANCE }; // Match these to BinaryOpCode in config.py diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 9f073c716c..3ca19e8a7e 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -46,6 +46,7 @@ struct ScalarUnaryRed { Point origin; Point shape; RHS to_find; + RHS mu; bool dense; struct DenseReduction {}; @@ -61,6 +62,7 @@ struct ScalarUnaryRed { out = args.out.reduce_accessor(); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { to_find = args.args[0].scalar(); } + if constexpr (OP_CODE == UnaryRedCode::VARIANCE) { mu = args.args[0].scalar(); } #ifndef LEGATE_BOUNDS_CHECKS // Check to see if this is dense or not @@ -79,6 +81,8 @@ struct ScalarUnaryRed { OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { auto p = pitches.unflatten(idx, origin); OP::template fold(lhs, OP::convert(p, shape, identity, inptr[idx])); + } else if constexpr (OP_CODE == UnaryRedCode::VARIANCE) { + OP::template fold(lhs, OP::convert(inptr[idx] - mu, identity)); } else { OP::template fold(lhs, OP::convert(inptr[idx], identity)); } @@ -86,15 +90,15 @@ struct ScalarUnaryRed { __CUDA_HD__ void operator()(LHS& lhs, size_t idx, LHS identity, SparseReduction) const noexcept { + auto p = pitches.unflatten(idx, origin); if constexpr (OP_CODE == UnaryRedCode::CONTAINS) { - auto point = pitches.unflatten(idx, origin); - if (in[point] == to_find) { lhs = true; } + if (in[p] == to_find) { lhs = true; } } else if constexpr (OP_CODE == UnaryRedCode::ARGMAX || OP_CODE == UnaryRedCode::ARGMIN || OP_CODE == UnaryRedCode::NANARGMAX || OP_CODE == UnaryRedCode::NANARGMIN) { - auto p = pitches.unflatten(idx, origin); OP::template fold(lhs, OP::convert(p, shape, identity, in[p])); + } else if constexpr (OP_CODE == UnaryRedCode::VARIANCE) { + OP::template fold(lhs, OP::convert(in[p] - mu, identity)); } else { - auto p = pitches.unflatten(idx, origin); OP::template fold(lhs, OP::convert(in[p], identity)); } } diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 34d92710bf..e822e40b45 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -40,6 +40,8 @@ enum class UnaryRedCode : int { NANSUM = CUNUMERIC_RED_NANSUM, PROD = CUNUMERIC_RED_PROD, SUM = CUNUMERIC_RED_SUM, + SUM_SQUARES = CUNUMERIC_RED_SUM_SQUARES, + VARIANCE = CUNUMERIC_RED_VARIANCE }; template @@ -89,6 +91,10 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&.. 
return f.template operator()(std::forward(args)...); case UnaryRedCode::SUM: return f.template operator()(std::forward(args)...); + case UnaryRedCode::SUM_SQUARES: + return f.template operator()(std::forward(args)...); + case UnaryRedCode::VARIANCE: + return f.template operator()(std::forward(args)...); default: break; } assert(false); @@ -264,6 +270,52 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs, const VAL) { return rhs; } }; +template +struct UnaryRedOp { + static constexpr bool valid = true; + + using RHS = legate::legate_type_of; + using VAL = RHS; + using OP = Legion::SumReduction; + + template + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const VAL, const RHS& rhs) + { + return rhs * rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs, const VAL) { return rhs * rhs; } +}; + +template +struct UnaryRedOp { + static constexpr bool valid = true; + + using RHS = legate::legate_type_of; + using VAL = RHS; + using OP = Legion::SumReduction; + + template + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const VAL, const RHS& rhs) + { + return rhs * rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs, const VAL) { return rhs * rhs; } +}; + template struct UnaryRedOp { static constexpr bool valid = !legate::is_complex::value; diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py new file mode 100644 index 0000000000..dfa1b0fa33 --- /dev/null +++ b/tests/integration/test_stats.py @@ -0,0 +1,205 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import functools + +import numpy as np +import pytest +from utils.comparisons import allclose + +import cunumeric as num + +np.random.seed(143) + + +def check_result(in_np, out_np, out_num, **isclose_kwargs): + if in_np.dtype == "e" or out_np.dtype == "e": + # The mantissa is only 10 bits, 2**-10 ~= 10^(-4) + # Gives 1e-3 as rtol to provide extra rounding error. 
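[Editor's note] A brief aside on the half-precision tolerance logic that begins here (the `f16_rtol` assignment continues just below). This is a small NumPy check of the arithmetic behind it, not part of the patch:

```python
import numpy as np

# float16 carries a 10-bit mantissa, so its machine epsilon is
# 2**-10 ~= 9.8e-4. A reduction over many elements accumulates several
# such rounding errors, hence the looser relative tolerance used below.
assert np.finfo(np.float16).eps == 2.0**-10
print(np.finfo(np.float16).eps)  # 0.000977
```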
+ f16_rtol = 1e-2 + rtol = isclose_kwargs.setdefault("rtol", f16_rtol) + # make sure we aren't trying to fp16 compare with less precision + assert rtol >= f16_rtol + + if "negative_test" in isclose_kwargs: + is_negative_test = isclose_kwargs["negative_test"] + else: + is_negative_test = False + + result = ( + allclose(out_np, out_num, **isclose_kwargs) + and out_np.dtype == out_num.dtype + ) + if not result and not is_negative_test: + print("cunumeric failed the test") + print("Input:") + print(in_np) + print(f"dtype: {in_np.dtype}") + print("NumPy output:") + print(out_np) + print(f"dtype: {out_np.dtype}") + print("cuNumeric output:") + print(out_num) + print(f"dtype: {out_num.dtype}") + return result + + +def check_op(op_np, op_num, in_np, out_dtype, **check_kwargs): + in_num = num.array(in_np) + + out_np = op_np(in_np) + out_num = op_num(in_num) + + assert check_result(in_np, out_np, out_num, **check_kwargs) + + out_np = np.empty(out_np.shape, dtype=out_dtype) + out_num = num.empty(out_num.shape, dtype=out_dtype) + + op_np(in_np, out=out_np) + op_num(in_num, out=out_num) + + assert check_result(in_np, out_np, out_num, **check_kwargs) + + +def get_op_input( + shape=(4, 5), + a_min=None, + a_max=None, + randint=False, + offset=None, + astype=None, + out_dtype="d", + replace_zero=None, + **check_kwargs, +): + if randint: + assert a_min is not None + assert a_max is not None + in_np = np.random.randint(a_min, a_max, size=shape) + else: + in_np = np.random.randn(*shape) + if offset is not None: + in_np = in_np + offset + if a_min is not None: + in_np = np.maximum(a_min, in_np) + if a_max is not None: + in_np = np.minimum(a_max, in_np) + if astype is not None: + in_np = in_np.astype(astype) + + if replace_zero is not None: + in_np[in_np == 0] = replace_zero + + # converts to a scalar if shape is (1,) + if in_np.ndim == 1 and in_np.shape[0] == 1: + in_np = in_np[0] + + return in_np + + +dtypes = ( + "e", + "f", + "d", +) + + +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("ddof", [0, 1]) +@pytest.mark.parametrize("axis", [None, 0, 1]) +@pytest.mark.parametrize("keepdims", [False, True]) +def test_var_default_shape(dtype, ddof, axis, keepdims): + np_in = get_op_input(astype=dtype) + + op_np = functools.partial(np.var, ddof=ddof, axis=axis, keepdims=keepdims) + op_num = functools.partial( + num.var, ddof=ddof, axis=axis, keepdims=keepdims + ) + + check_op(op_np, op_num, np_in, dtype) + + +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("ddof", [0, 1]) +@pytest.mark.parametrize("axis", [None, 0, 1, 2]) +@pytest.mark.parametrize("shape", [(10,), (4, 5), (2, 3, 4)]) +def test_var_w_shape(dtype, ddof, axis, shape): + np_in = get_op_input(astype=dtype, shape=shape) + + if axis is not None and axis >= len(shape): + axis = None + + op_np = functools.partial(np.var, ddof=ddof, axis=axis) + op_num = functools.partial(num.var, ddof=ddof, axis=axis) + + check_op(op_np, op_num, np_in, dtype) + + +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("ddof", [0, 1]) +@pytest.mark.parametrize( + "axis", + [ + None, + ], +) +@pytest.mark.parametrize( + "shape", + [ + (10, 1), + ], +) +def test_var_corners(dtype, ddof, axis, shape): + np_in = get_op_input(astype=dtype, shape=shape) + + if axis is not None and axis >= len(shape): + axis = None + + op_np = functools.partial(np.var, ddof=ddof, axis=axis) + op_num = functools.partial(num.var, ddof=ddof, axis=axis) + + check_op(op_np, op_num, np_in, dtype) + + +@pytest.mark.xfail 
+@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("ddof", [0, 1]) +@pytest.mark.parametrize( + "axis", + [ + None, + ], +) +@pytest.mark.parametrize( + "shape", + [ + (1,), + ], +) +def test_var_xfail(dtype, ddof, axis, shape): + np_in = get_op_input(astype=dtype, shape=shape) + + op_np = functools.partial(np.var, ddof=ddof, axis=axis) + op_num = functools.partial(num.var, ddof=ddof, axis=axis) + + check_op(op_np, op_num, np_in, dtype, negative_test=True) + + +if __name__ == "__main__": + import sys + + np.random.seed(12345) + + sys.exit(pytest.main(sys.argv)) From 2666c08bbdb2ad5f83876ac8ed5c61bbe01af712 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 26 Oct 2023 16:33:34 -0700 Subject: [PATCH 09/18] Create bug_report.yml (#1062) --- .github/ISSUE_TEMPLATE/bug_report.yml | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000..8bf716ed17 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,97 @@ +name: Bug report +description: Submit a bug report +title: "[BUG] " +labels: TRIAGE +body: + - type: markdown + attributes: + value: "# Bug report" + - type: markdown + attributes: + value: Thank you for reporting a bug and helping us improve Cunumeric! + - type: markdown + attributes: + value: > + Please fill out all of the required information. + - type: markdown + attributes: + value: | + --- + ## Environment information + - type: textarea + id: legate_issue + attributes: + label: Software versions + description: >- + Run `legate-issue` and paste the output here. + placeholder: | + Python : 3.10.11 | packaged by conda-forge | (main, May 10 2023, 18:58:44) [GCC 11.3.0] + Platform : Linux-5.14.0-1042-oem-x86_64-with-glibc2.31 + Legion : v23.11.00.dev-16-g2499f878 + Legate : 23.11.00.dev+17.gb7b50313 + Cunumeric : (ImportError: cannot import name 'LogicalArray' from 'legate.core') + Numpy : 1.24.4 + Scipy : 1.10.1 + Numba : (not installed) + CTK package : cuda-version-11.8-h70ddcb2_2 (conda-forge) + GPU Driver : 515.65.01 + GPU Devices : + GPU 0: Quadro RTX 8000 + GPU 1: Quadro RTX 8000 + validations: + required: true + - type: input + id: jupyter + attributes: + label: Jupyter notebook / Jupyter Lab version + description: >- + Please supply if the issue you are reporting is related to Jupyter + notebook or Jupyter Lab. + validations: + required: false + - type: markdown + attributes: + value: | + ## Issue details + - type: textarea + id: expected-behavior + attributes: + label: Expected behavior + description: What did you expect to happen? + validations: + required: true + - type: textarea + id: observed-behavior + attributes: + label: Observed behavior + description: What did actually happen? + validations: + required: true + - type: markdown + attributes: + value: | + ## Directions to reproduce + - type: textarea + id: example + attributes: + label: Example code or instructions + description: > + Please provide detailed instructions to reproduce the issue. Ideally this includes a + [Complete, minimal, self-contained example code](https://stackoverflow.com/help/minimal-reproducible-example) + given here or as a link to code in another repository. 
+ render: Python + validations: + required: true + - type: markdown + attributes: + value: | + ## Additional information + - type: textarea + id: traceback-console + attributes: + label: Stack traceback or browser console output + description: > + Add any error messages or logs that might be helpful in reproducing and + identifying the bug, for example a Python stack traceback. + validations: + required: false From 3fdc85600b171a3b83ca45b0746efb3080684237 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 2 Nov 2023 15:24:31 -0700 Subject: [PATCH 10/18] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ed163b571..78924b360a 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ with cuNumeric going forward: new features to cuNumeric. * We plan to add support for sharded file I/O for loading and storing large data sets that could never be loaded on a single node. - Initially this will begin with native support for [h5py](https://www.h5py.org/) + Initially this will begin with native support for hdf5 and zarr, but will grow to accommodate other formats needed by our lighthouse applications. * Strong scaling: while cuNumeric is currently implemented in a way that From 586a005ab6bb1536af456a735071cf320351167b Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 3 Nov 2023 09:50:19 -0700 Subject: [PATCH 11/18] Fix #1065 (#1067) * Fix #1065 * Bump legate.core commit --- cmake/versions.json | 2 +- cunumeric/eager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/versions.json b/cmake/versions.json index b99da26cb4..93a1d80010 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -5,7 +5,7 @@ "git_url" : "https://github.com/nv-legate/legate.core.git", "git_shallow": false, "always_download": false, - "git_tag" : "8997f997be02936304b3ac23fe785f1de7a3424b" + "git_tag" : "a4b5430ebb2c52e3f8da8f27534bc0db8826b804" } } } diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 63284eb942..26fc980168 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -1615,7 +1615,7 @@ def where(self, rhs1: Any, rhs2: Any, rhs3: Any) -> None: if self.deferred is not None: self.deferred.where(rhs1, rhs2, rhs3) else: - self.array[:] = np.where(rhs1.array, rhs2.array, rhs3.array) + self.array[...] = np.where(rhs1.array, rhs2.array, rhs3.array) def argwhere(self) -> NumPyThunk: if self.deferred is not None: From f8c94f022947c72de121f38a6301ec322c4c11bb Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 6 Nov 2023 15:24:19 -0800 Subject: [PATCH 12/18] Suggest using mamba over conda (#1068) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 78924b360a..7516331ff0 100644 --- a/README.md +++ b/README.md @@ -40,15 +40,15 @@ If you have questions, please contact us at legate(at)nvidia.com. cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric): ``` -conda install -c nvidia -c conda-forge -c legate cunumeric +mamba install -c nvidia -c conda-forge -c legate cunumeric ``` Only linux-64 packages are available at the moment. The default package contains GPU support, and is compatible with CUDA >= 11.8 (CUDA driver version >= r520), and Volta or later GPU architectures. There are -also CPU-only packages available, and will be automatically selected by `conda` -when installing on a machine without GPUs. 
+also CPU-only packages available, and will be automatically selected when +installing on a machine without GPUs. See the build instructions at https://nv-legate.github.io/cunumeric for details about building cuNumeric from source. From a010564e568648f8fe2c8f7b9aadf5f0a9bcca59 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 6 Nov 2023 16:02:04 -0800 Subject: [PATCH 13/18] Missing CUDA libs are now included in main conda env get script (#1058) * CUDA libraries now included in main conda env generation script * Bump legate.core ref to commit with updated generate-conda-envs --- continuous_integration/scripts/build-cunumeric-all | 6 ------ 1 file changed, 6 deletions(-) diff --git a/continuous_integration/scripts/build-cunumeric-all b/continuous_integration/scripts/build-cunumeric-all index bcdbf62ec5..66f5ccb6e0 100755 --- a/continuous_integration/scripts/build-cunumeric-all +++ b/continuous_integration/scripts/build-cunumeric-all @@ -3,12 +3,6 @@ setup_env() { yaml_file=$(find ~/.artifacts -name "environment*.yaml" | head -n 1) - [ "${USE_CUDA:-}" = "ON" ] && - echo " - libcublas-dev" >> "${yaml_file}" && - echo " - libcufft-dev" >> "${yaml_file}" && - echo " - libcurand-dev" >> "${yaml_file}" && - echo " - libcusolver-dev" >> "${yaml_file}"; - echo "YAML file..." cat "${yaml_file}" From fce99c0316f1311f9468ff593b6e7a1a4df5a237 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 6 Nov 2023 16:02:31 -0800 Subject: [PATCH 14/18] Typos (#1063) --- scripts/util/build-caching.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/util/build-caching.sh b/scripts/util/build-caching.sh index 70de985d32..9fb4c1b4a4 100755 --- a/scripts/util/build-caching.sh +++ b/scripts/util/build-caching.sh @@ -7,9 +7,9 @@ if [[ -n "$(which sccache)" ]]; then CMAKE_CUDA_COMPILER_LAUNCHER="${CMAKE_CUDA_COMPILER_LAUNCHER:-$(which sccache)}"; elif [[ -n "$(which ccache)" ]]; then # Use ccache if installed - CMAKE_C_COMPILER_LAUNCHER="${CMAKE_C_COMPILER_LAUNCHER:-$(which cache)}"; - CMAKE_CXX_COMPILER_LAUNCHER="${CMAKE_CXX_COMPILER_LAUNCHER:-$(which cache)}"; - CMAKE_CUDA_COMPILER_LAUNCHER="${CMAKE_CUDA_COMPILER_LAUNCHER:-$(which cache)}"; + CMAKE_C_COMPILER_LAUNCHER="${CMAKE_C_COMPILER_LAUNCHER:-$(which ccache)}"; + CMAKE_CXX_COMPILER_LAUNCHER="${CMAKE_CXX_COMPILER_LAUNCHER:-$(which ccache)}"; + CMAKE_CUDA_COMPILER_LAUNCHER="${CMAKE_CUDA_COMPILER_LAUNCHER:-$(which ccache)}"; fi export CMAKE_C_COMPILER_LAUNCHER="$CMAKE_C_COMPILER_LAUNCHER" From 8c67416c95583ca29e9a712e1cb2060ec57fd565 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 7 Nov 2023 10:08:05 -0800 Subject: [PATCH 15/18] Fixes #1069, #1070 (#1072) * Find handling of optimize=True in einsum * Use einsum path optimizer by default cuNumeric can only contract two arrays at a time, so the naive input-order contraction path can easily result in huge intermediates. 
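[Editor's note] To make the contraction-order point concrete, here is a small NumPy illustration of how a greedy contraction path avoids a large intermediate; the shapes are made up for illustration, and cuNumeric exposes the same `einsum`/`einsum_path` interface referenced in this patch:

```python
import numpy as np

x = np.ones((1000, 2))
y = np.ones((2, 1000))
z = np.ones((1000, 1))

# Contracting in input order would materialize the (1000, 1000) product
# x @ y before collapsing it against z; the greedy optimizer typically
# contracts y with z first, keeping the largest intermediate tiny.
path, report = np.einsum_path("ij,jk,kl->il", x, y, z, optimize="greedy")
print(path)    # e.g. ['einsum_path', (1, 2), (0, 1)]
print(report)  # includes the largest intermediate and FLOP estimates
```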
* Bump legate.core git hash --- cmake/versions.json | 2 +- cunumeric/module.py | 13 ++++++++----- tests/integration/test_einsum.py | 4 +--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cmake/versions.json b/cmake/versions.json index 93a1d80010..43d60fa5e1 100644 --- a/cmake/versions.json +++ b/cmake/versions.json @@ -5,7 +5,7 @@ "git_url" : "https://github.com/nv-legate/legate.core.git", "git_shallow": false, "always_download": false, - "git_tag" : "a4b5430ebb2c52e3f8da8f27534bc0db8826b804" + "git_tag" : "6fa0acc9dcfa89be2702f1de6c045bc262f752b1" } } } diff --git a/cunumeric/module.py b/cunumeric/module.py index e2bbc78f7a..47c4dea90e 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4587,7 +4587,7 @@ def einsum( out: Optional[ndarray] = None, dtype: Optional[np.dtype[Any]] = None, casting: CastingKind = "safe", - optimize: Union[bool, str] = False, + optimize: Union[bool, Literal["greedy", "optimal"]] = True, ) -> ndarray: """ Evaluates the Einstein summation convention on the operands. @@ -4628,9 +4628,10 @@ def einsum( Default is 'safe'. optimize : ``{False, True, 'greedy', 'optimal'}``, optional - Controls if intermediate optimization should occur. No optimization - will occur if False. Uses opt_einsum to find an optimized contraction - plan if True. + Controls if intermediate optimization should occur. If False then + arrays will be contracted in input order, one at a time. True (the + default) will use the 'greedy' algorithm. See ``cunumeric.einsum_path`` + for more information on the available optimization algorithms. Returns ------- @@ -4654,7 +4655,9 @@ def einsum( if out is not None: out = convert_to_cunumeric_ndarray(out, share=True) - if not optimize: + if optimize is True: + optimize = "greedy" + elif optimize is False: optimize = NullOptimizer() # This call normalizes the expression (adds the output part if it's diff --git a/tests/integration/test_einsum.py b/tests/integration/test_einsum.py index e482e8cf09..4fcdd2402f 100644 --- a/tests/integration/test_einsum.py +++ b/tests/integration/test_einsum.py @@ -272,7 +272,7 @@ def test_cast(expr, dtype): False, "optimal", "greedy", - pytest.param(True, marks=pytest.mark.xfail), + True, ], ) def test_optimize(optimize): @@ -282,8 +282,6 @@ def test_optimize(optimize): np_res = np.einsum("ik,kj->ij", a, b, optimize=optimize) num_res = num.einsum("ik,kj->ij", a, b, optimize=optimize) assert allclose(np_res, num_res) - # when optimize=True, cunumeric raises - # TypeError: 'bool' object is not iterable def test_expr_opposite(): From b66e2ecdf7d245fd9b754b6f4ef6d42c7ed6681d Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 7 Nov 2023 23:42:26 -0800 Subject: [PATCH 16/18] Add batched cholesky implementation and tests (#1029) * add batched cholesky implementation and tests * missing files * fix correctness issues in transpose lower implementation * address PR comments * remove print statements * address more PR comments * test fixes * remove outdated comment * Add missing "throws exception" annotation --------- Co-authored-by: Manolis Papadakis --- cunumeric/config.py | 2 + cunumeric/linalg/cholesky.py | 40 ++++- cunumeric/linalg/linalg.py | 4 - cunumeric_cpp.cmake | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/mapper.cc | 19 +++ src/cunumeric/matrix/batched_cholesky.cc | 85 ++++++++++ src/cunumeric/matrix/batched_cholesky.cu | 111 +++++++++++++ src/cunumeric/matrix/batched_cholesky.h | 38 +++++ src/cunumeric/matrix/batched_cholesky_omp.cc | 83 ++++++++++ .../matrix/batched_cholesky_template.inl | 147 
++++++++++++++++++ src/cunumeric/matrix/potrf.cc | 72 ++++----- src/cunumeric/matrix/potrf.cu | 55 ++++--- src/cunumeric/matrix/potrf_omp.cc | 72 ++++----- src/cunumeric/matrix/potrf_template.inl | 20 +++ tests/integration/test_cholesky.py | 49 +++++- tests/unit/cunumeric/test_config.py | 1 + 17 files changed, 692 insertions(+), 110 deletions(-) create mode 100644 src/cunumeric/matrix/batched_cholesky.cc create mode 100644 src/cunumeric/matrix/batched_cholesky.cu create mode 100644 src/cunumeric/matrix/batched_cholesky.h create mode 100644 src/cunumeric/matrix/batched_cholesky_omp.cc create mode 100644 src/cunumeric/matrix/batched_cholesky_template.inl diff --git a/cunumeric/config.py b/cunumeric/config.py index bdea334a16..635544bd86 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -32,6 +32,7 @@ class _CunumericSharedLib: CUNUMERIC_ADVANCED_INDEXING: int CUNUMERIC_ARANGE: int CUNUMERIC_ARGWHERE: int + CUNUMERIC_BATCHED_CHOLESKY: int CUNUMERIC_BINARY_OP: int CUNUMERIC_BINARY_RED: int CUNUMERIC_BINCOUNT: int @@ -333,6 +334,7 @@ class CuNumericOpCode(IntEnum): ADVANCED_INDEXING = _cunumeric.CUNUMERIC_ADVANCED_INDEXING ARANGE = _cunumeric.CUNUMERIC_ARANGE ARGWHERE = _cunumeric.CUNUMERIC_ARGWHERE + BATCHED_CHOLESKY = _cunumeric.CUNUMERIC_BATCHED_CHOLESKY BINARY_OP = _cunumeric.CUNUMERIC_BINARY_OP BINARY_RED = _cunumeric.CUNUMERIC_BINARY_RED BINCOUNT = _cunumeric.CUNUMERIC_BINCOUNT diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py index 9bba033619..4ff4fe2127 100644 --- a/cunumeric/linalg/cholesky.py +++ b/cunumeric/linalg/cholesky.py @@ -1,4 +1,4 @@ -# Copyright 2021-2022 NVIDIA Corporation +# Copyright 2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -202,11 +202,47 @@ def tril(context: Context, p_output: StorePartition, n: int) -> None: task.execute() +def _batched_cholesky(output: DeferredArray, input: DeferredArray) -> None: + # the only feasible implementation for right now is that + # each cholesky submatrix fits on a single proc. We will have + # wildly varying memory available depending on the system. + # Just use a fixed cutoff to provide some sensible warning. 
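[Editor's note] The code comment continues just below with a TODO. For orientation, a quick back-of-the-envelope check of the 32768 cutoff mentioned above, plus the batched semantics in NumPy terms; this snippet is illustrative only and not part of the patch:

```python
import numpy as np

# Back-of-the-envelope check of the warning threshold mentioned above:
# a single n x n float64 submatrix at n = 32768 already occupies 8 GiB.
n = 32768
print(n * n * 8 / 2**30)  # 8.0 (GiB)

# Batched Cholesky semantics: each trailing square submatrix of a
# stacked input is factored independently into its lower-triangular
# factor, matching NumPy's stacked-array behavior.
batch = np.stack([np.eye(3) * (i + 1) for i in range(4)])  # shape (4, 3, 3)
lower = np.linalg.cholesky(batch)                          # shape (4, 3, 3)
assert np.allclose(lower[2] @ lower[2].T, batch[2])
```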
+ # TODO: find a better way to inform the user dims are too big + context: Context = output.context + task = context.create_auto_task(CuNumericOpCode.BATCHED_CHOLESKY) + task.add_input(input.base) + task.add_output(output.base) + ndim = input.base.ndim + task.add_broadcast(input.base, (ndim - 2, ndim - 1)) + task.add_broadcast(output.base, (ndim - 2, ndim - 1)) + task.add_alignment(input.base, output.base) + task.throws_exception(LinAlgError) + task.execute() + + def cholesky( output: DeferredArray, input: DeferredArray, no_tril: bool ) -> None: runtime = output.runtime - context = output.context + context: Context = output.context + if len(input.base.shape) > 2: + if no_tril: + raise NotImplementedError( + "batched cholesky expects to only " + "produce the lower triangular matrix" + ) + size = input.base.shape[-1] + # Choose 32768 as dimension cutoff for warning + # so that for float64 anything larger than + # 8 GiB produces a warning + if size > 32768: + runtime.warn( + "batched cholesky is only valid" + " when the square submatrices fit" + f" on a single proc, n > {size} may be too large", + category=UserWarning, + ) + return _batched_cholesky(output, input) if runtime.num_procs == 1: transpose_copy_single(context, input.base, output.base) diff --git a/cunumeric/linalg/linalg.py b/cunumeric/linalg/linalg.py index f3f7eb9fb8..d1c0498b2e 100644 --- a/cunumeric/linalg/linalg.py +++ b/cunumeric/linalg/linalg.py @@ -82,10 +82,6 @@ def cholesky(a: ndarray) -> ndarray: elif shape[-1] != shape[-2]: raise ValueError("Last 2 dimensions of the array must be square") - if len(shape) > 2: - raise NotImplementedError( - "cuNumeric needs to support stacked 2d arrays" - ) return _cholesky(a) diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index 4270962bae..f7feee620e 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -143,6 +143,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/index/putmask.cc src/cunumeric/item/read.cc src/cunumeric/item/write.cc + src/cunumeric/matrix/batched_cholesky.cc src/cunumeric/matrix/contract.cc src/cunumeric/matrix/diag.cc src/cunumeric/matrix/gemm.cc @@ -195,6 +196,7 @@ if(Legion_USE_OpenMP) src/cunumeric/index/repeat_omp.cc src/cunumeric/index/wrap_omp.cc src/cunumeric/index/zip_omp.cc + src/cunumeric/matrix/batched_cholesky_omp.cc src/cunumeric/matrix/contract_omp.cc src/cunumeric/matrix/diag_omp.cc src/cunumeric/matrix/gemm_omp.cc @@ -245,6 +247,7 @@ if(Legion_USE_CUDA) src/cunumeric/index/putmask.cu src/cunumeric/item/read.cu src/cunumeric/item/write.cu + src/cunumeric/matrix/batched_cholesky.cu src/cunumeric/matrix/contract.cu src/cunumeric/matrix/diag.cu src/cunumeric/matrix/gemm.cu diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index b5b3928355..99d9bea191 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -29,6 +29,7 @@ enum CuNumericOpCode { CUNUMERIC_ADVANCED_INDEXING, CUNUMERIC_ARANGE, CUNUMERIC_ARGWHERE, + CUNUMERIC_BATCHED_CHOLESKY, CUNUMERIC_BINARY_OP, CUNUMERIC_BINARY_RED, CUNUMERIC_BINCOUNT, diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 247ded4fdd..ba7114e45f 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -145,6 +145,25 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } + // CHANGE: If this code is changed, make sure all layouts are + // consistent with those assumed in batched_cholesky.cu, etc + case CUNUMERIC_BATCHED_CHOLESKY: { + std::vector mappings; + auto& inputs = task.inputs(); + auto& outputs = 
task.outputs(); + mappings.reserve(inputs.size() + outputs.size()); + for (auto& input : inputs) { + mappings.push_back(StoreMapping::default_mapping(input, options.front())); + mappings.back().policy.exact = true; + mappings.back().policy.ordering.set_c_order(); + } + for (auto& output : outputs) { + mappings.push_back(StoreMapping::default_mapping(output, options.front())); + mappings.back().policy.exact = true; + mappings.back().policy.ordering.set_c_order(); + } + return std::move(mappings); + } case CUNUMERIC_TRILU: { if (task.scalars().size() == 2) return {}; // If we're here, this task was the post-processing for Cholesky. diff --git a/src/cunumeric/matrix/batched_cholesky.cc b/src/cunumeric/matrix/batched_cholesky.cc new file mode 100644 index 0000000000..30dbe3c53d --- /dev/null +++ b/src/cunumeric/matrix/batched_cholesky.cc @@ -0,0 +1,85 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/matrix/batched_cholesky.h" +#include "cunumeric/cunumeric.h" +#include "cunumeric/matrix/batched_cholesky_template.inl" + +#include +#include +#include + +namespace cunumeric { + +using namespace legate; + +template <> +void CopyBlockImpl::operator()(void* dst, const void* src, size_t size) +{ + ::memcpy(dst, src, size); +} + +template +struct BatchedTransposeImplBody { + using VAL = legate_type_of; + + static constexpr int tile_size = 64; + + void operator()(VAL* out, int n) const + { + VAL tile[tile_size][tile_size]; + int nblocks = (n + tile_size - 1) / tile_size; + + for (int rb = 0; rb < nblocks; ++rb) { + for (int cb = 0; cb < nblocks; ++cb) { + int r_start = rb * tile_size; + int r_stop = std::min(r_start + tile_size, n); + int c_start = cb * tile_size; + int c_stop = std::min(c_start + tile_size, n); + for (int r = r_start, tr = 0; r < r_stop; ++r, ++tr) { + for (int c = c_start, tc = 0; c < c_stop; ++c, ++tc) { + if (r <= c) { + tile[tr][tc] = out[r * n + c]; + } else { + tile[tr][tc] = 0; + } + } + } + for (int r = c_start, tr = 0; r < c_stop; ++r, ++tr) { + for (int c = r_start, tc = 0; c < r_stop; ++c, ++tc) { out[r * n + c] = tile[tc][tr]; } + } + } + } + } +}; + +/*static*/ void BatchedCholeskyTask::cpu_variant(TaskContext& context) +{ +#ifdef LEGATE_USE_OPENMP + openblas_set_num_threads(1); // make sure this isn't overzealous +#endif + batched_cholesky_task_context_dispatch(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) +{ + BatchedCholeskyTask::register_variants(); +} +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/matrix/batched_cholesky.cu b/src/cunumeric/matrix/batched_cholesky.cu new file mode 100644 index 0000000000..26fe3058f7 --- /dev/null +++ b/src/cunumeric/matrix/batched_cholesky.cu @@ -0,0 +1,111 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/matrix/batched_cholesky.h" +#include "cunumeric/matrix/potrf.h" +#include "cunumeric/matrix/batched_cholesky_template.inl" + +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace legate; + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 + +template <> +void CopyBlockImpl::operator()(void* dst, const void* src, size_t size) +{ + cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, get_cached_stream()); +} + +template +__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM) + transpose_2d_lower(VAL* out, int n) +{ + __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/]; + + // The y dim is fast-moving index for coalescing + auto r_block = blockIdx.x * TILE_DIM; + auto c_block = blockIdx.y * TILE_DIM; + auto r = blockIdx.x * TILE_DIM + threadIdx.x; + auto c = blockIdx.y * TILE_DIM + threadIdx.y; + auto stride = BLOCK_ROWS; + // The tile coordinates + auto tr = threadIdx.x; + auto tc = threadIdx.y; + auto offset = r * n + c; + + // only execute across the upper diagonal + // a single thread block will store the upper diagonal block into + // a temp shared memory then set the block to zeros + if (c_block >= r_block) { +#pragma unroll + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride) { + if (r < n && (c + i) < n) { + if (r <= (c + i)) { + tile[tr][tc + i] = out[offset]; + // clear the upper diagonal entry + out[offset] = 0; + } else { + tile[tr][tc + i] = 0; + } + } + } + + // Make sure all the data is in shared memory + __syncthreads(); + + // Transpose the global coordinates, keep y the fast-moving index + r = blockIdx.y * TILE_DIM + threadIdx.x; + c = blockIdx.x * TILE_DIM + threadIdx.y; + offset = r * n + c; + +#pragma unroll + for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride) { + if (r < n && (c + i) < n) { + if (r >= (c + i)) { out[offset] = tile[tc + i][tr]; } + } + } + } +} + +template +struct BatchedTransposeImplBody { + using VAL = legate_type_of; + + void operator()(VAL* out, int n) const + { + const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (n + TILE_DIM - 1) / TILE_DIM, 1); + const dim3 threads(TILE_DIM, BLOCK_ROWS, 1); + + auto stream = get_cached_stream(); + + // CUDA Potrf produces the full matrix, we only want + // the lower diagonal + transpose_2d_lower<<>>(out, n); + + CHECK_CUDA_STREAM(stream); + } +}; + +/*static*/ void BatchedCholeskyTask::gpu_variant(TaskContext& context) +{ + batched_cholesky_task_context_dispatch(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/matrix/batched_cholesky.h b/src/cunumeric/matrix/batched_cholesky.h new file mode 100644 index 0000000000..fceba2a9f9 --- /dev/null +++ b/src/cunumeric/matrix/batched_cholesky.h @@ -0,0 +1,38 @@ +/* Copyright 2021-2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" +#include "cunumeric/cunumeric_c.h" + +namespace cunumeric { + +class BatchedCholeskyTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_BATCHED_CHOLESKY; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/matrix/batched_cholesky_omp.cc b/src/cunumeric/matrix/batched_cholesky_omp.cc new file mode 100644 index 0000000000..84b311ff25 --- /dev/null +++ b/src/cunumeric/matrix/batched_cholesky_omp.cc @@ -0,0 +1,83 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/cunumeric.h" +#include "cunumeric/matrix/batched_cholesky.h" +#include "cunumeric/matrix/batched_cholesky_template.inl" + +#include +#include +#include + +namespace cunumeric { + +using namespace legate; + +template <> +void CopyBlockImpl::operator()(void* dst, const void* src, size_t n) +{ + ::memcpy(dst, src, n); +} + +template +struct BatchedTransposeImplBody { + using VAL = legate_type_of; + + static constexpr int tile_size = 64; + + void operator()(VAL* out, int n) const + { + int nblocks = (n + tile_size - 1) / tile_size; + +#pragma omp parallel for + for (int rb = 0; rb < nblocks; ++rb) { + // only loop the upper diagonal + // transpose the elements that are there and + // zero out the elements after reading them + for (int cb = rb; cb < nblocks; ++cb) { + VAL tile[tile_size][tile_size]; + int r_start = rb * tile_size; + int r_stop = std::min(r_start + tile_size, n); + int c_start = cb * tile_size; + int c_stop = std::min(c_start + tile_size, n); + + for (int r = r_start, tr = 0; r < r_stop; ++r, ++tr) { + for (int c = c_start, tc = 0; c < c_stop; ++c, ++tc) { + if (r <= c) { + auto offset = r * n + c; + tile[tr][tc] = out[offset]; + out[offset] = 0; + } else { + tile[tr][tc] = 0; + } + } + } + + for (int r = c_start, tr = 0; r < c_stop; ++r, ++tr) { + for (int c = r_start, tc = 0; c < r_stop; ++c, ++tc) { out[r * n + c] = tile[tc][tr]; } + } + } + } + } +}; + +/*static*/ void BatchedCholeskyTask::omp_variant(TaskContext& context) +{ + openblas_set_num_threads(omp_get_max_threads()); + batched_cholesky_task_context_dispatch(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/matrix/batched_cholesky_template.inl b/src/cunumeric/matrix/batched_cholesky_template.inl new file mode 100644 index 0000000000..8d266e3f06 --- /dev/null +++ b/src/cunumeric/matrix/batched_cholesky_template.inl @@ -0,0 +1,147 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +// Useful for IDEs +#include +#include "cunumeric/cunumeric.h" +#include "cunumeric/matrix/batched_cholesky.h" +#include "cunumeric/matrix/potrf_template.inl" +#include "cunumeric/matrix/transpose_template.inl" +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace legate; + +template +struct BatchedCholeskyImplBody { + template + void operator()(T* array, int32_t m, int32_t n) + { + PotrfImplBody()(array, m, n); + } +}; + +template +struct CopyBlockImpl { + void operator()(void* dst, const void* src, size_t n); +}; + +template +struct BatchedTransposeImplBody { + using VAL = legate_type_of; + + void operator()(VAL* array, int32_t n); +}; + +template +struct _cholesky_supported { + static constexpr bool value = CODE == Type::Code::FLOAT64 || CODE == Type::Code::FLOAT32 || + CODE == Type::Code::COMPLEX64 || CODE == Type::Code::COMPLEX128; +}; + +template +struct BatchedCholeskyImpl { + template + void operator()(Array& input_array, Array& output_array) const + { + using VAL = legate_type_of; + + auto shape = input_array.shape(); + if (shape != output_array.shape()) { + throw legate::TaskException( + "Batched cholesky is not supported when input/output shapes differ"); + } + + Pitches pitches; + size_t volume = pitches.flatten(shape); + + if (volume == 0) return; + + auto ncols = shape.hi[DIM - 1] - shape.lo[DIM - 1] + 1; + + size_t in_strides[DIM]; + size_t out_strides[DIM]; + + auto input = input_array.read_accessor(shape).ptr(shape, in_strides); + if (in_strides[DIM - 2] != ncols || in_strides[DIM - 1] != 1) { + throw legate::TaskException( + "Bad input accessor in batched cholesky, last two dimensions must be non-transformed and " + "dense with stride == 1"); + } + + auto output = output_array.write_accessor(shape).ptr(shape, out_strides); + if (out_strides[DIM - 2] != ncols || out_strides[DIM - 1] != 1) { + throw legate::TaskException( + "Bad output accessor in batched cholesky, last two dimensions must be non-transformed and " + "dense with stride == 1"); + } + + if (shape.empty()) return; + + int num_blocks = 1; + for (int i = 0; i < (DIM - 2); ++i) { num_blocks *= (shape.hi[i] - shape.lo[i] + 1); } + + auto m = static_cast(shape.hi[DIM - 2] - shape.lo[DIM - 2] + 1); + auto n = static_cast(shape.hi[DIM - 1] - shape.lo[DIM - 1] + 1); + assert(m > 0 && n > 0); + + auto block_stride = m * n; + + for (int i = 0; i < num_blocks; ++i) { + if constexpr (_cholesky_supported::value) { + CopyBlockImpl()(output, input, sizeof(VAL) * block_stride); + PotrfImplBody()(output, m, n); + // Implicit assumption here about the cholesky code created. + // We assume the output has C layout, but each subblock + // will be generated in Fortran layout. Transpose the Fortran + // subblock into C layout. + // CHANGE: If this code is changed, please make sure all changes + // are consistent with those found in mapper.cc. 
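[Editor's note] The loop body resumes immediately below. As a conceptual reference for what the per-block loop in this template does (copy the block, factorize it, keep only the lower triangle in C layout), here is a minimal NumPy sketch; the function name and structure are illustrative, not the actual C++ implementation:

```python
import numpy as np


def batched_cholesky_sketch(a):
    # Conceptual model of the per-block loop: for every trailing 2-D
    # square block, copy it, factor it, and keep only the lower triangle
    # in the row-major (C-order) output. The real kernel additionally
    # transposes the Fortran-layout factor back into C layout.
    a = np.ascontiguousarray(a)
    out = np.empty_like(a)
    m, n = a.shape[-2], a.shape[-1]
    flat_in = a.reshape(-1, m, n)
    flat_out = out.reshape(-1, m, n)
    for i in range(flat_in.shape[0]):
        block = flat_in[i].copy()           # analogous to CopyBlockImpl
        factor = np.linalg.cholesky(block)  # analogous to PotrfImplBody
        flat_out[i] = np.tril(factor)       # zero everything above the diagonal
    return out


stack = np.stack([np.eye(2) * (i + 2) for i in range(3)])
assert np.allclose(batched_cholesky_sketch(stack), np.linalg.cholesky(stack))
```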
+ BatchedTransposeImplBody()(output, n); + input += block_stride; + output += block_stride; + } + } + } +}; + +template +static void batched_cholesky_task_context_dispatch(TaskContext& context) +{ + auto& batched_input = context.inputs()[0]; + auto& batched_output = context.outputs()[0]; + if (batched_input.code() != batched_output.code()) { + throw legate::TaskException( + "batched cholesky is not yet supported when input/output types differ"); + } + if (batched_input.dim() != batched_output.dim()) { + throw legate::TaskException("input/output have different dims in batched cholesky"); + } + if (batched_input.dim() <= 2) { + throw legate::TaskException( + "internal error: batched cholesky input does not have more than 2 dims"); + } + double_dispatch(batched_input.dim(), + batched_input.code(), + BatchedCholeskyImpl{}, + batched_input, + batched_output); +} + +} // namespace cunumeric diff --git a/src/cunumeric/matrix/potrf.cc b/src/cunumeric/matrix/potrf.cc index 02ae062461..46ed58b6a3 100644 --- a/src/cunumeric/matrix/potrf.cc +++ b/src/cunumeric/matrix/potrf.cc @@ -25,48 +25,48 @@ namespace cunumeric { using namespace legate; template <> -struct PotrfImplBody { - void operator()(float* array, int32_t m, int32_t n) - { - char uplo = 'L'; - int32_t info = 0; - LAPACK_spotrf(&uplo, &n, array, &m, &info); - if (info != 0) throw legate::TaskException("Matrix is not positive definite"); - } -}; +void PotrfImplBody::operator()(float* array, + int32_t m, + int32_t n) +{ + char uplo = 'L'; + int32_t info = 0; + LAPACK_spotrf(&uplo, &n, array, &m, &info); + if (info != 0) throw legate::TaskException("Matrix is not positive definite"); +} template <> -struct PotrfImplBody { - void operator()(double* array, int32_t m, int32_t n) - { - char uplo = 'L'; - int32_t info = 0; - LAPACK_dpotrf(&uplo, &n, array, &m, &info); - if (info != 0) throw legate::TaskException("Matrix is not positive definite"); - } -}; +void PotrfImplBody::operator()(double* array, + int32_t m, + int32_t n) +{ + char uplo = 'L'; + int32_t info = 0; + LAPACK_dpotrf(&uplo, &n, array, &m, &info); + if (info != 0) throw legate::TaskException("Matrix is not positive definite"); +} template <> -struct PotrfImplBody { - void operator()(complex* array, int32_t m, int32_t n) - { - char uplo = 'L'; - int32_t info = 0; - LAPACK_cpotrf(&uplo, &n, reinterpret_cast<__complex__ float*>(array), &m, &info); - if (info != 0) throw legate::TaskException("Matrix is not positive definite"); - } -}; +void PotrfImplBody::operator()(complex* array, + int32_t m, + int32_t n) +{ + char uplo = 'L'; + int32_t info = 0; + LAPACK_cpotrf(&uplo, &n, reinterpret_cast<__complex__ float*>(array), &m, &info); + if (info != 0) throw legate::TaskException("Matrix is not positive definite"); +} template <> -struct PotrfImplBody { - void operator()(complex* array, int32_t m, int32_t n) - { - char uplo = 'L'; - int32_t info = 0; - LAPACK_zpotrf(&uplo, &n, reinterpret_cast<__complex__ double*>(array), &m, &info); - if (info != 0) throw legate::TaskException("Matrix is not positive definite"); - } -}; +void PotrfImplBody::operator()(complex* array, + int32_t m, + int32_t n) +{ + char uplo = 'L'; + int32_t info = 0; + LAPACK_zpotrf(&uplo, &n, reinterpret_cast<__complex__ double*>(array), &m, &info); + if (info != 0) throw legate::TaskException("Matrix is not positive definite"); +} /*static*/ void PotrfTask::cpu_variant(TaskContext& context) { diff --git a/src/cunumeric/matrix/potrf.cu b/src/cunumeric/matrix/potrf.cu index 68616525f5..8f13a5168c 100644 --- 
diff --git a/src/cunumeric/matrix/potrf.cu b/src/cunumeric/matrix/potrf.cu
index 68616525f5..8f13a5168c 100644
--- a/src/cunumeric/matrix/potrf.cu
+++ b/src/cunumeric/matrix/potrf.cu
@@ -49,41 +49,38 @@ static inline void potrf_template(
 }
 
 template <>
-struct PotrfImplBody<VariantKind::GPU, Type::Code::FLOAT32> {
-  void operator()(float* array, int32_t m, int32_t n)
-  {
-    potrf_template(cusolverDnSpotrf_bufferSize, cusolverDnSpotrf, array, m, n);
-  }
-};
+void PotrfImplBody<VariantKind::GPU, Type::Code::FLOAT32>::operator()(float* array,
+                                                                      int32_t m,
+                                                                      int32_t n)
+{
+  potrf_template(cusolverDnSpotrf_bufferSize, cusolverDnSpotrf, array, m, n);
+}
 
 template <>
-struct PotrfImplBody<VariantKind::GPU, Type::Code::FLOAT64> {
-  void operator()(double* array, int32_t m, int32_t n)
-  {
-    potrf_template(cusolverDnDpotrf_bufferSize, cusolverDnDpotrf, array, m, n);
-  }
-};
+void PotrfImplBody<VariantKind::GPU, Type::Code::FLOAT64>::operator()(double* array,
+                                                                      int32_t m,
+                                                                      int32_t n)
+{
+  potrf_template(cusolverDnDpotrf_bufferSize, cusolverDnDpotrf, array, m, n);
+}
 
 template <>
-struct PotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX64> {
-  void operator()(complex<float>* array, int32_t m, int32_t n)
-  {
-    potrf_template(
-      cusolverDnCpotrf_bufferSize, cusolverDnCpotrf, reinterpret_cast<cuComplex*>(array), m, n);
-  }
-};
+void PotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX64>::operator()(complex<float>* array,
+                                                                        int32_t m,
+                                                                        int32_t n)
+{
+  potrf_template(
+    cusolverDnCpotrf_bufferSize, cusolverDnCpotrf, reinterpret_cast<cuComplex*>(array), m, n);
+}
 
 template <>
-struct PotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX128> {
-  void operator()(complex<double>* array, int32_t m, int32_t n)
-  {
-    potrf_template(cusolverDnZpotrf_bufferSize,
-                   cusolverDnZpotrf,
-                   reinterpret_cast<cuDoubleComplex*>(array),
-                   m,
-                   n);
-  }
-};
+void PotrfImplBody<VariantKind::GPU, Type::Code::COMPLEX128>::operator()(complex<double>* array,
+                                                                         int32_t m,
+                                                                         int32_t n)
+{
+  potrf_template(
+    cusolverDnZpotrf_bufferSize, cusolverDnZpotrf, reinterpret_cast<cuDoubleComplex*>(array), m, n);
+}
 
 /*static*/ void PotrfTask::gpu_variant(TaskContext& context)
 {
diff --git a/src/cunumeric/matrix/potrf_omp.cc b/src/cunumeric/matrix/potrf_omp.cc
index d26143a6f2..36b32968d1 100644
--- a/src/cunumeric/matrix/potrf_omp.cc
+++ b/src/cunumeric/matrix/potrf_omp.cc
@@ -26,48 +26,48 @@ namespace cunumeric {
 using namespace legate;
 
 template <>
-struct PotrfImplBody<VariantKind::OMP, Type::Code::FLOAT32> {
-  void operator()(float* array, int32_t m, int32_t n)
-  {
-    char uplo = 'L';
-    int32_t info = 0;
-    LAPACK_spotrf(&uplo, &n, array, &m, &info);
-    if (info != 0) throw legate::TaskException("Matrix is not positive definite");
-  }
-};
+void PotrfImplBody<VariantKind::OMP, Type::Code::FLOAT32>::operator()(float* array,
+                                                                      int32_t m,
+                                                                      int32_t n)
+{
+  char uplo = 'L';
+  int32_t info = 0;
+  LAPACK_spotrf(&uplo, &n, array, &m, &info);
+  if (info != 0) throw legate::TaskException("Matrix is not positive definite");
+}
 
 template <>
-struct PotrfImplBody<VariantKind::OMP, Type::Code::FLOAT64> {
-  void operator()(double* array, int32_t m, int32_t n)
-  {
-    char uplo = 'L';
-    int32_t info = 0;
-    LAPACK_dpotrf(&uplo, &n, array, &m, &info);
-    if (info != 0) throw legate::TaskException("Matrix is not positive definite");
-  }
-};
+void PotrfImplBody<VariantKind::OMP, Type::Code::FLOAT64>::operator()(double* array,
+                                                                      int32_t m,
+                                                                      int32_t n)
+{
+  char uplo = 'L';
+  int32_t info = 0;
+  LAPACK_dpotrf(&uplo, &n, array, &m, &info);
+  if (info != 0) throw legate::TaskException("Matrix is not positive definite");
+}
 
 template <>
-struct PotrfImplBody<VariantKind::OMP, Type::Code::COMPLEX64> {
-  void operator()(complex<float>* array, int32_t m, int32_t n)
-  {
-    char uplo = 'L';
-    int32_t info = 0;
-    LAPACK_cpotrf(&uplo, &n, reinterpret_cast<__complex__ float*>(array), &m, &info);
-    if (info != 0) throw legate::TaskException("Matrix is not positive definite");
-  }
-};
+void PotrfImplBody<VariantKind::OMP, Type::Code::COMPLEX64>::operator()(complex<float>* array,
+                                                                        int32_t m,
+                                                                        int32_t n)
+{
+  char uplo = 'L';
+  int32_t info = 0;
+  LAPACK_cpotrf(&uplo, &n, reinterpret_cast<__complex__ float*>(array), &m, &info);
+  if (info != 0) throw legate::TaskException("Matrix is not positive definite");
+}
 
 template <>
-struct PotrfImplBody<VariantKind::OMP, Type::Code::COMPLEX128> {
-  void operator()(complex<double>* array, int32_t m, int32_t n)
-  {
-    char uplo = 'L';
-    int32_t info = 0;
-    LAPACK_zpotrf(&uplo, &n, reinterpret_cast<__complex__ double*>(array), &m, &info);
-    if (info != 0) throw legate::TaskException("Matrix is not positive definite");
-  }
-};
+void PotrfImplBody<VariantKind::OMP, Type::Code::COMPLEX128>::operator()(complex<double>* array,
+                                                                         int32_t m,
+                                                                         int32_t n)
+{
+  char uplo = 'L';
+  int32_t info = 0;
+  LAPACK_zpotrf(&uplo, &n, reinterpret_cast<__complex__ double*>(array), &m, &info);
+  if (info != 0) throw legate::TaskException("Matrix is not positive definite");
+}
 
 /*static*/ void PotrfTask::omp_variant(TaskContext& context)
 {
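Note (illustration only): the COMPLEX64/COMPLEX128 specializations above use the same lower-triangular convention as the real ones; for complex inputs the factorization being computed is A = L @ L^H with A Hermitian positive definite. A small NumPy-only reminder of that convention, not taken from the patch:

import numpy as np

rng = np.random.default_rng(0)
n = 4
m = rng.standard_normal((n, n)) + 1j * rng.standard_normal((n, n))
a = m @ m.conj().T + n * np.eye(n)        # Hermitian positive definite
l = np.linalg.cholesky(a)
assert np.allclose(l @ l.conj().T, a)     # A = L @ L^H (conjugate transpose, not plain L @ L.T)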
diff --git a/src/cunumeric/matrix/potrf_template.inl b/src/cunumeric/matrix/potrf_template.inl
index 55c782ad05..7e42521897 100644
--- a/src/cunumeric/matrix/potrf_template.inl
+++ b/src/cunumeric/matrix/potrf_template.inl
@@ -26,6 +26,26 @@ using namespace legate;
 template <VariantKind KIND, Type::Code CODE>
 struct PotrfImplBody;
 
+template <VariantKind KIND>
+struct PotrfImplBody<KIND, Type::Code::FLOAT32> {
+  void operator()(float* array, int32_t m, int32_t n);
+};
+
+template <VariantKind KIND>
+struct PotrfImplBody<KIND, Type::Code::FLOAT64> {
+  void operator()(double* array, int32_t m, int32_t n);
+};
+
+template <VariantKind KIND>
+struct PotrfImplBody<KIND, Type::Code::COMPLEX64> {
+  void operator()(complex<float>* array, int32_t m, int32_t n);
+};
+
+template <VariantKind KIND>
+struct PotrfImplBody<KIND, Type::Code::COMPLEX128> {
+  void operator()(complex<double>* array, int32_t m, int32_t n);
+};
+
 template <Type::Code CODE>
 struct support_potrf : std::false_type {};
 template <>
diff --git a/tests/integration/test_cholesky.py b/tests/integration/test_cholesky.py
index 91edbaa7ea..5b2659a160 100644
--- a/tests/integration/test_cholesky.py
+++ b/tests/integration/test_cholesky.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2022 NVIDIA Corporation
+# Copyright 2023 NVIDIA Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -56,10 +56,14 @@ def test_diagonal(): assert allclose(b**2.0, a) +def _get_real_symm_posdef(n): + a = num.random.rand(n, n) + return a + a.T + num.eye(n) * n + + @pytest.mark.parametrize("n", SIZES) def test_real(n): - a = num.random.rand(n, n) - b = a + a.T + num.eye(n) * n + b = _get_real_symm_posdef(n) c = num.linalg.cholesky(b) c_np = np.linalg.cholesky(b.__array__()) assert allclose(c, c_np) @@ -80,6 +84,45 @@ def test_complex(n): assert allclose(c, c_np) +@pytest.mark.parametrize("n", SIZES) +def test_batched_3d(n): + batch = 4 + a = _get_real_symm_posdef(n) + np_a = a.__array__() + a_batched = num.einsum("i,jk->ijk", np.arange(batch) + 1, a) + test_c = num.linalg.cholesky(a_batched) + for i in range(batch): + correct = np.linalg.cholesky(np_a * (i + 1)) + test = test_c[i, :] + assert allclose(correct, test) + + +def test_batched_empty(): + batch = 4 + a = _get_real_symm_posdef(8) + a_batched = num.einsum("i,jk->ijk", np.arange(batch) + 1, a) + a_sliced = a_batched[0:0, :, :] + empty = num.linalg.cholesky(a_sliced) + assert empty.shape == a_sliced.shape + + +@pytest.mark.parametrize("n", SIZES) +def test_batched_4d(n): + batch = 2 + a = _get_real_symm_posdef(n) + np_a = a.__array__() + + outer = np.einsum("i,j->ij", np.arange(batch) + 1, np.arange(batch) + 1) + + a_batched = num.einsum("ij,kl->ijkl", outer, a) + test_c = num.linalg.cholesky(a_batched) + for i in range(batch): + for j in range(batch): + correct = np.linalg.cholesky(np_a * (i + 1) * (j + 1)) + test = test_c[i, j, :] + assert allclose(correct, test) + + if __name__ == "__main__": import sys diff --git a/tests/unit/cunumeric/test_config.py b/tests/unit/cunumeric/test_config.py index 5e85ccfde5..6f8f43df5c 100644 --- a/tests/unit/cunumeric/test_config.py +++ b/tests/unit/cunumeric/test_config.py @@ -117,6 +117,7 @@ def test_CuNumericOpCode() -> None: "ADVANCED_INDEXING", "ARANGE", "ARGWHERE", + "BATCHED_CHOLESKY", "BINARY_OP", "BINARY_RED", "BINCOUNT", From 98a73bd44a91419377de8774a47733ce5c630847 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 8 Nov 2023 14:12:57 -0800 Subject: [PATCH 17/18] Remove negative test that's now passing (#1073) --- tests/integration/test_cholesky.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/test_cholesky.py b/tests/integration/test_cholesky.py index 5b2659a160..c4b52754b0 100644 --- a/tests/integration/test_cholesky.py +++ b/tests/integration/test_cholesky.py @@ -35,12 +35,6 @@ def test_array_negative_1dim(): num.linalg.cholesky(arr) -def test_array_negative_3dim(): - arr = num.random.randint(0, 9, size=(3, 3, 3)) - with pytest.raises(NotImplementedError): - num.linalg.cholesky(arr) - - def test_array_negative(): arr = num.random.randint(0, 9, size=(3, 2, 3)) expected_exc = ValueError From 6ffdc4c5220e85379446c6066549c0b7c109aa40 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Nov 2023 15:11:19 -0800 Subject: [PATCH 18/18] adding v23.11 to documentation (#1074) --- docs/cunumeric/source/versions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/cunumeric/source/versions.rst b/docs/cunumeric/source/versions.rst index 4a21cc9ef9..1760786d8e 100644 --- a/docs/cunumeric/source/versions.rst +++ b/docs/cunumeric/source/versions.rst @@ -11,3 +11,4 @@ Versions 23.03 23.07 23.09 + 23.11
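Note (illustration only, mirroring test_batched_3d above): with this series applied, stacked inputs are expected to behave roughly as follows, using the same num/np aliases as the tests.

import numpy as np
import cunumeric as num

batch, n = 4, 8
a = num.random.rand(n, n)
b = a + a.T + num.eye(n) * n                                # symmetric positive definite block
stacked = num.einsum("i,jk->ijk", np.arange(batch) + 1, b)  # shape (batch, n, n)
l = num.linalg.cholesky(stacked)                            # factorizes every 2-D sub-block
np_b = b.__array__()
for i in range(batch):
    assert np.allclose(l[i].__array__(), np.linalg.cholesky(np_b * (i + 1)))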