24.06.01 release

manopapad · manopapad · commit 427da00d053c · 2024-09-11T11:20:36.000-07:00
diff --git a/.github/workflows/ci-gh-nightly-release.yml b/.github/workflows/ci-gh-nightly-release.yml
@@ -23,12 +23,17 @@ jobs:
         upload-enabled:
           - true
           - false
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
     uses:
       ./.github/workflows/gh-build-and-test.yml
     with:
       target-device: ${{ matrix.target-device }}
       platform: ${{ matrix.platform }}
       build-type: release
       upload-enabled: ${{ matrix.upload-enabled }}
+      python-version: ${{ matrix.python-version }}
       dependencies-workflow: "ci-gh-nightly-release.yml"
     secrets: inherit
diff --git a/.github/workflows/ci-gh-release.yml b/.github/workflows/ci-gh-release.yml
@@ -25,6 +25,8 @@ jobs:
           - cpu
         upload-enabled:
           - false
+        python-version:
+          - "3.12"
         exclude:
           - platform: linux-aarch64
             target-device: gpu
@@ -35,5 +37,6 @@ jobs:
       platform: ${{ matrix.platform }}
       build-type: release
       upload-enabled: ${{ matrix.upload-enabled }}
+      python-version: ${{ matrix.python-version }}
       dependencies-workflow: "ci-gh-nightly-release.yml"
     secrets: inherit
diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
@@ -17,6 +17,10 @@ on:
         required: true
         type: string
         description: The workflow file name used by the dependency
+      python-version:
+        required: false
+        type: string
+        default: "3.12"
 
 jobs:
   setup-build:
@@ -41,9 +45,9 @@ jobs:
 
   build:
     needs: setup-build
-    name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }})"
+    name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, Python ${{ inputs.python-version }})"
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       target-device: ${{ inputs.target-device }}
@@ -53,10 +57,11 @@ jobs:
       platform: ${{ inputs.platform }}
       dependencies-file: "cmake/versions.json"
       dependencies-workflow: ${{ inputs.dependencies-workflow }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
 
 
@@ -65,20 +70,21 @@ jobs:
     if: ${{ github.repository_owner == 'nv-legate' && contains(github.workflow, 'release') && inputs.upload-enabled == true }}
     name: Upload package to Server
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
       name: Upload package to Server
       target-device: ${{ inputs.target-device }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
       upload-action: "upload-package"
       pkgSubString: "cunumeric-"
       repos-Root: "cunumeric"
+      python-version: ${{ inputs.python-version }}
     secrets: inherit    
 
   setup-test:
@@ -150,7 +156,7 @@ jobs:
       matrix: ${{fromJson(needs.setup-test.outputs.matrix)}}
 
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
@@ -160,29 +166,31 @@ jobs:
       has-gpu: ${{ matrix.runner.type == 'gpu' }}
       test-options: ${{ matrix.test-config.test-options }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
 
   updateTestStatus:
     needs: test
     name: Update Test status on Server
     if: ${{ (github.repository_owner == 'nv-legate') && contains(github.workflow, 'Nightly') && (inputs.upload-enabled == true) }}
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
       name: UpdateTestStatus
       target-device: ${{ inputs.target-device }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: true
       upload-action: "update-test-status"
       pkgSubString: "cunumeric-"
-      repos-Root: "cunumeric"  
+      repos-Root: "cunumeric"
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -57,7 +57,7 @@ include(rapids-cuda)
 include(rapids-export)
 include(rapids-find)
 
-set(cunumeric_version 24.05.00)
+set(cunumeric_version 24.06.00)
 
 # For now we want the optimization flags to match on both normal make and cmake
 # builds so we override the cmake defaults here for release, this changes
diff --git a/README.md b/README.md
@@ -52,6 +52,14 @@ or install it into an existing environment:
 conda install -c conda-forge -c legate cunumeric
 ```
 
+In an environment without GPUs available, `conda install` will by default choose a CPU-only package.
+To install a version with GPU support in such an environment, set the `CONDA_OVERRIDE_CUDA` environment variable to the CUDA version you want to target:
+
+```shell
+CONDA_OVERRIDE_CUDA="12.2" \
+  conda install -c conda-forge -c legate cunumeric
+```
+
 Once installed, you can verify the installation by running one of the examples
 from the cuNumeric repository, for instance:
 
diff --git a/cmake/versions.json b/cmake/versions.json
@@ -2,12 +2,12 @@
   "packages" : {
     "legate_core" : {
       "repo": "legate.core.internal",
-      "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-${{ inputs.target-device }}-release-<<git_tag>>",
-      "version": "24.05.00",
+      "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-python${{ env.PYTHON_VERSION }}-${{ inputs.target-device }}-release-<<git_tag>>",
+      "version": "24.06.00",
       "git_url" : "git@github.com:nv-legate/legate.core.internal.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "2e1ca409a4f67593aeb859834085919907e9e531"
+      "git_tag" : "6f1c6e55789be286ec8e2e94dc1d95e5dbbc10a2"
     }
   }
 }
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
@@ -84,12 +84,15 @@ build:
     - AWS_SECRET_ACCESS_KEY
 {% if not gpu_enabled_bool %}
     - CPU_ONLY=1
+  # The CPU-only packages having more track_features than the GPU builds helps
+  # the solver to prefer the GPU builds when both are viable candidates.
+  # ref: https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html#track-features
   track_features:
     - cpu_only
 {% endif %}
 
 ignore_run_exports_from:
-  # scikit-build should really be a part of the build env, but then it installs its own Python.  Conda build stacks 
+  # scikit-build should really be a part of the build env, but then it installs its own Python.  Conda build stacks
   # the build environment on the host environment, and the build python takes over causing paths havoc.  So, we put
   # scikit-build into the host env, but we ignore any exports it may bring.
   - scikit-build
@@ -101,12 +104,11 @@ requirements:
     - cmake {{ cmake_version }}
     - {{ compiler('c') }} =11.2
     - {{ compiler('cxx') }} =11.2
-        # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
-
-    - cuda-nvcc ={{ cuda_version }}
+    # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
+    - cuda-nvcc
     # cudart needed for CPU and GPU builds because of curand
-
-    - cuda-cudart-dev ={{ cuda_version }}
+    - cuda-cudart-dev
+    - cuda-version ={{ cuda_version }}
 
 
   host:
@@ -125,19 +127,24 @@ requirements:
 {% else %}
     - legate-core >={{ core_version }} =*_cpu*
 {% endif %}
+    - cuda-version ={{ cuda_version }}
 
   run:
     - numpy {{ numpy_version }}
     - libnvjitlink
     - libcusparse
     - opt_einsum >=3.3
     - scipy
+    - openblas =* =*openmp*
+    # Pin to all minor versions of CUDA newer than the one built against, within the same major version.
+    # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available
+    - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }}
+{% if gpu_enabled_bool %}
+    - __cuda >={{ cuda_version }}
+{% endif %}
 
   run_constrained:
     - __glibc >=2.17  # [linux]
-{% if gpu_enabled_bool %}
-    - __cuda
-{% endif %}
 
 about:
   home: https://github.com/nv-legate/cunumeric
diff --git a/src/cunumeric/item/write.cc b/src/cunumeric/item/write.cc
@@ -21,11 +21,11 @@ namespace cunumeric {
 
 using namespace legate;
 
-template <typename VAL>
-struct WriteImplBody<VariantKind::CPU, VAL> {
-  void operator()(AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1>& value) const
+template <typename VAL, int DIM>
+struct WriteImplBody<VariantKind::CPU, VAL, DIM> {
+  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const
   {
-    out[0] = value[0];
+    out[0] = value[Point<DIM>::ZEROES()];
   }
 };
 
diff --git a/src/cunumeric/item/write.cu b/src/cunumeric/item/write.cu
@@ -20,19 +20,19 @@
 
 namespace cunumeric {
 
-template <typename VAL>
+template <typename VAL, int DIM>
 static __global__ void __launch_bounds__(1, 1)
-  write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1> value)
+  write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, DIM> value)
 {
-  out[0] = value[0];
+  out[0] = value[Point<DIM>::ZEROES()];
 }
 
-template <typename VAL>
-struct WriteImplBody<VariantKind::GPU, VAL> {
-  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, 1>& value) const
+template <typename VAL, int DIM>
+struct WriteImplBody<VariantKind::GPU, VAL, DIM> {
+  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const
   {
     auto stream = get_cached_stream();
-    write_value<VAL><<<1, 1, 0, stream>>>(out, value);
+    write_value<VAL, DIM><<<1, 1, 0, stream>>>(out, value);
     CUNUMERIC_CHECK_CUDA_STREAM(stream);
   }
 };
diff --git a/src/cunumeric/item/write_template.inl b/src/cunumeric/item/write_template.inl
@@ -23,18 +23,18 @@ namespace cunumeric {
 
 using namespace legate;
 
-template <VariantKind KIND, typename VAL>
+template <VariantKind KIND, typename VAL, int DIM>
 struct WriteImplBody;
 
 template <VariantKind KIND>
 struct WriteImpl {
-  template <Type::Code CODE>
+  template <Type::Code CODE, int DIM>
   void operator()(legate::PhysicalStore out_arr, legate::PhysicalStore in_arr) const
   {
     using VAL = type_of<CODE>;
     auto out  = out_arr.write_accessor<VAL, 1>();
-    auto in   = in_arr.read_accessor<VAL, 1>();
-    WriteImplBody<KIND, VAL>()(out, in);
+    auto in   = in_arr.read_accessor<VAL, DIM>();
+    WriteImplBody<KIND, VAL, DIM>()(out, in);
   }
 };
 
@@ -43,7 +43,8 @@ static void write_template(TaskContext& context)
 {
   auto in  = context.input(0);
   auto out = context.output(0);
-  type_dispatch(out.type().code(), WriteImpl<KIND>{}, out, in);
+  auto dim = std::max(1, in.dim());
+  legate::double_dispatch(dim, out.type().code(), WriteImpl<KIND>(), out, in);
 }
 
 }  // namespace cunumeric
diff --git a/tests/cpp/integration/util.inl b/tests/cpp/integration/util.inl
@@ -15,32 +15,23 @@
  */
 
 namespace {
+
+template <typename T, typename U = void>
+struct has_operator_left_shift : std::false_type {};
+
 template <typename T>
-std::stringstream& print_value(std::stringstream& ss, T value)
-{
-  ss << value;
-  return ss;
-}
+struct has_operator_left_shift<T, std::void_t<decltype(std::cout << std::declval<T>())>>
+  : std::true_type {};
 
-template <>
-std::stringstream& print_value<complex<float>>(std::stringstream& ss, complex<float> value)
-{
-  // operator<< missing for cuda::std::complex
-  // The issue is going to be fixed in the next cuda release.
-#if CUDART_VERSION >= 12050
-  ss << value;
-#endif
-  return ss;
-}
+template <typename T>
+constexpr bool has_operator_left_shift_v = has_operator_left_shift<T>::value;
 
-template <>
-std::stringstream& print_value<complex<double>>(std::stringstream& ss, complex<double> value)
+template <typename T>
+std::stringstream& print_value(std::stringstream& ss, T value)
 {
-  // operator<< missing for cuda::std::complex
-  // The issue is going to be fixed in the next cuda release.
-#if CUDART_VERSION >= 12050
-  ss << value;
-#endif
+  if constexpr (has_operator_left_shift_v<T>) {
+    ss << value;
+  }
   return ss;
 }
 
diff --git a/tests/integration/test_singleton_access.py b/tests/integration/test_singleton_access.py
@@ -64,11 +64,11 @@ def array_gen(lib):
         yield arr
     for arr in nonscalar_gen(lib):
         idx_tuple = arr.ndim * (2,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1,), -1)
         yield arr
     for arr in nonscalar_gen(lib):
         idx_tuple = arr.ndim * (2,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1, 1), -1)
         yield arr
     # set single item on scalar array
     for arr in scalar_gen(lib, 42):
@@ -77,11 +77,11 @@ def array_gen(lib):
         yield arr
     for arr in scalar_gen(lib, 42):
         idx_tuple = arr.ndim * (0,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1,), -1)
         yield arr
     for arr in scalar_gen(lib, 42):
         idx_tuple = arr.ndim * (0,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1, 1), -1)
         yield arr
     # set "multiple" items on scalar array
     for arr in scalar_gen(lib, 42):

Original file line number	Diff line number	Diff line change
`@@ -2,12 +2,12 @@`
`2`	`2`	`"packages" : {`
`3`	`3`	`"legate_core" : {`
`4`	`4`	`"repo": "legate.core.internal",`
`5`		`- "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-${{ inputs.target-device }}-release-<<git_tag>>",`
`6`		`- "version": "24.05.00",`
	`5`	`+ "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-python${{ env.PYTHON_VERSION }}-${{ inputs.target-device }}-release-<<git_tag>>",`
	`6`	`+ "version": "24.06.00",`
`7`	`7`	`"git_url" : "git@github.com:nv-legate/legate.core.internal.git",`
`8`	`8`	`"git_shallow": false,`
`9`	`9`	`"always_download": false,`
`10`		`- "git_tag" : "2e1ca409a4f67593aeb859834085919907e9e531"`
	`10`	`+ "git_tag" : "6f1c6e55789be286ec8e2e94dc1d95e5dbbc10a2"`
`11`	`11`	`}`
`12`	`12`	`}`
`13`	`13`	`}`
Original file line number	Diff line number	Diff line change
`@@ -21,11 +21,11 @@ namespace cunumeric {`
`21`	`21`
`22`	`22`	`using namespace legate;`
`23`	`23`
`24`		`-template <typename VAL>`
`25`		`-struct WriteImplBody<VariantKind::CPU, VAL> {`
`26`		`- void operator()(AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1>& value) const`
	`24`	`+template <typename VAL, int DIM>`
	`25`	`+struct WriteImplBody<VariantKind::CPU, VAL, DIM> {`
	`26`	`+ void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const`
`27`	`27`	`{`
`28`		`- out[0] = value[0];`
	`28`	`+ out[0] = value[Point<DIM>::ZEROES()];`
`29`	`29`	`}`
`30`	`30`	`};`
`31`	`31`
Original file line number	Diff line number	Diff line change
`@@ -20,19 +20,19 @@`
`20`	`20`
`21`	`21`	`namespace cunumeric {`
`22`	`22`
`23`		`-template <typename VAL>`
	`23`	`+template <typename VAL, int DIM>`
`24`	`24`	`static __global__ void __launch_bounds__(1, 1)`
`25`		`- write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1> value)`
	`25`	`+ write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, DIM> value)`
`26`	`26`	`{`
`27`		`- out[0] = value[0];`
	`27`	`+ out[0] = value[Point<DIM>::ZEROES()];`
`28`	`28`	`}`
`29`	`29`
`30`		`-template <typename VAL>`
`31`		`-struct WriteImplBody<VariantKind::GPU, VAL> {`
`32`		`- void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, 1>& value) const`
	`30`	`+template <typename VAL, int DIM>`
	`31`	`+struct WriteImplBody<VariantKind::GPU, VAL, DIM> {`
	`32`	`+ void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const`
`33`	`33`	`{`
`34`	`34`	`auto stream = get_cached_stream();`
`35`		`- write_value<VAL><<<1, 1, 0, stream>>>(out, value);`
	`35`	`+ write_value<VAL, DIM><<<1, 1, 0, stream>>>(out, value);`
`36`	`36`	`CUNUMERIC_CHECK_CUDA_STREAM(stream);`
`37`	`37`	`}`
`38`	`38`	`};`