From fa7adb0280f4a9a47a0ede5b91e6c3bfc1c73a65 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 23 May 2024 15:32:54 -0700 Subject: [PATCH 01/11] update requirements to use new cellxgene-schema 5.1 --- python_dependencies/processing/requirements.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python_dependencies/processing/requirements.txt b/python_dependencies/processing/requirements.txt index 5396acfd9dc69..31e8a22bf0f5b 100644 --- a/python_dependencies/processing/requirements.txt +++ b/python_dependencies/processing/requirements.txt @@ -2,12 +2,13 @@ anndata==0.8.0 awscli boto3>=1.11.17 botocore>=1.14.17 -cellxgene-schema +# install the cellxgene-schema version from latest commit on the main branch from github +git+https://github.com/chanzuckerberg/single-cell-curation.git@main#egg=cellxgene-schema&subdirectory=cellxgene_schema_cli dataclasses-json ddtrace==2.1.4 -numba==0.56.2 -numpy==1.23.2 -pandas==1.4.4 +numba +numpy +pandas psutil>=5.9.0 psycopg2-binary==2.* pyarrow>=1.0 @@ -18,7 +19,7 @@ requests>=2.22.0 rpy2==3.5.16 rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability s3fs==0.4.2 -scanpy==1.9.3 +scanpy SQLAlchemy==2.* tenacity tiledb==0.25.0 # Portal's tiledb version should always be the same or older than Explorer's From cd4754f3a31201e205ae3338e646527475d7b9a9 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 23 May 2024 16:32:23 -0700 Subject: [PATCH 02/11] fix tests --- tests/unit/processing/test_h5ad_data_file.py | 4 ++-- tests/unit/processing/test_spatial_assets_utils.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/processing/test_h5ad_data_file.py b/tests/unit/processing/test_h5ad_data_file.py index 69f13aff4333b..fdb1a4967fd40 100644 --- a/tests/unit/processing/test_h5ad_data_file.py +++ b/tests/unit/processing/test_h5ad_data_file.py @@ -170,7 +170,7 @@ def test__slash_in_attribute_name(self): col_name = "fo/o" - attrs = [tiledb.Attr(name=col_name, dtype=np.int)] + attrs = [tiledb.Attr(name=col_name, dtype=int)] domain = tiledb.Domain(tiledb.Dim(domain=(0, 99), tile=100, dtype=np.uint32)) schema = tiledb.ArraySchema( domain=domain, sparse=False, attrs=attrs, cell_order="row-major", tile_order="row-major" @@ -180,7 +180,7 @@ def test__slash_in_attribute_name(self): try: with tiledb.open("foo", mode="w") as A: value = dict() - value[col_name] = np.zeros((100,), dtype=np.int) + value[col_name] = np.zeros((100,), dtype=int) A[:] = value # if there's a regression, this statement will throw a TileDBError # if we get here we're good finally: diff --git a/tests/unit/processing/test_spatial_assets_utils.py b/tests/unit/processing/test_spatial_assets_utils.py index c74a8c2cf0e0d..7bc0a9f960bc6 100644 --- a/tests/unit/processing/test_spatial_assets_utils.py +++ b/tests/unit/processing/test_spatial_assets_utils.py @@ -236,7 +236,7 @@ def test__upload_assets_failure(spatial_processor, output_folder, mocker): mock_upload.assert_called_once_with(output_folder, expected_s3_uri) -def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatial_data, mocker): +def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatial_data, mocker, tmpdir): mock_fetch_image = mocker.patch.object(spatial_processor, "_fetch_image") mock_process_and_flip_image = mocker.patch.object(spatial_processor, "_process_and_flip_image") mock_generate_deep_zoom_assets = mocker.patch.object(spatial_processor, "_generate_deep_zoom_assets") @@ -244,8 +244,7 @@ def 
test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # mock the TemporaryDirectory context manager mock_temp_dir = mocker.patch("tempfile.TemporaryDirectory") - temp_dir_name = "/mock/temp/dir" - mock_temp_dir.return_value.__enter__.return_value = temp_dir_name + mock_temp_dir.return_value.__enter__.return_value = tmpdir # mock return values for the internal methods mock_fetch_image.return_value = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8) @@ -254,7 +253,7 @@ def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # call the method under test spatial_processor.create_deep_zoom_assets(cxg_container, valid_spatial_data) - assets_folder = os.path.join(temp_dir_name, cxg_container.replace(".cxg", "")) + assets_folder = os.path.join(tmpdir, cxg_container.replace(".cxg", "")) # assertions to ensure each step is called mock_fetch_image.assert_called_once_with(valid_spatial_data) From a00aad2ea8c89a79059ad09fcc146e03f5702765 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:33:07 -0700 Subject: [PATCH 03/11] pass can_publish using s3 --- backend/layers/processing/schema_migration.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 68afe158f5c7a..5d9c7f95d2399 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -174,7 +174,6 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can # ^^^ The top level fields are used for handling error cases in the AWS SFN. "datasets": [ { - "can_publish": str(can_publish), "collection_id": collection_id, "collection_url": collection_url, "collection_version_id": private_collection_version_id, @@ -196,7 +195,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can self._store_sfn_response("publish_and_cleanup", version.collection_id.id, response) return response - def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> list: + def publish_and_cleanup(self, collection_version_id: str) -> list: errors = [] collection_version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id)) object_keys_to_delete = [] @@ -259,7 +258,7 @@ def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> self.s3_provider.delete_files(self.artifact_bucket, object_keys_to_delete) if errors: self._store_sfn_response("report/errors", collection_version_id, errors) - elif can_publish: + elif extra_info["can_publish"] == "true": self.business_logic.publish_collection_version(collection_version.version_id) return errors @@ -381,9 +380,8 @@ def migrate(self, step_name) -> bool: ) elif step_name == "collection_publish": collection_version_id = os.environ["COLLECTION_VERSION_ID"] - can_publish = os.environ["CAN_PUBLISH"].lower() == "true" publish_and_cleanup = self.error_wrapper(self.publish_and_cleanup, collection_version_id) - response = publish_and_cleanup(collection_version_id=collection_version_id, can_publish=can_publish) + response = publish_and_cleanup(collection_version_id=collection_version_id) elif step_name == "report": response = self.report() self.logger.info("output", extra={"response": response}) From 98a2ecf03b4ebc0cb451f025e04479ed8889e5ab Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:33:23 -0700 Subject: [PATCH 04/11] pass can_publish using s3 --- 
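Note on this change, together with PATCH 03: the can_publish flag appears to stop traveling through the step-function input and a CAN_PUBLISH container environment variable; instead it is carried in the JSON response that collection_migrate already stores with _store_sfn_response and is read back as extra_info inside publish_and_cleanup. Below is a minimal sketch of that round trip; the in-memory dict and the store/load helper names are stand-ins for illustration only, not the real S3 provider API or the project's actual helper names.

    import json

    bucket = {}  # stand-in for the artifact bucket that _store_sfn_response writes to

    def store_sfn_response(directory: str, key: str, response: dict) -> None:
        # serialize the step's response the way the real code persists it to S3
        bucket[f"{directory}/{key}.json"] = json.dumps(response)

    def load_sfn_response(directory: str, key: str) -> dict:
        # read the stored response back in a later step
        return json.loads(bucket[f"{directory}/{key}.json"])

    # collection_migrate step: stash the flag alongside the rest of the per-collection response
    store_sfn_response("publish_and_cleanup", "example-collection-id", {"can_publish": str(True)})

    # collection_publish step: recover it later instead of reading os.environ["CAN_PUBLISH"]
    extra_info = load_sfn_response("publish_and_cleanup", "example-collection-id")
    if extra_info["can_publish"].lower() == "true":
        print("publish_collection_version would be called here")
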
.happy/terraform/modules/schema_migration/main.tf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index 07025411f412c..f1ce7d083cc4e 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -353,10 +353,6 @@ resource aws_sfn_state_machine sfn_schema_migration { "Name": "COLLECTION_VERSION_ID", "Value.$": "$.collection_version_id" }, - { - "Name": "CAN_PUBLISH", - "Value.$": "$.can_publish" - }, { "Name": "TASK_TOKEN", "Value.$": "$$.Task.Token" From 5f21af208698a60b1c04074b39a99718ffa53d58 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:39:40 -0700 Subject: [PATCH 05/11] don't pass the error to future steps --- .happy/terraform/modules/schema_migration/main.tf | 10 +++++----- backend/layers/processing/schema_migration.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index f1ce7d083cc4e..bdee78f14768c 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -313,7 +313,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "ErrorEquals": [ "States.ALL" ], - "ResultPath": "$.error", + "ResultPath": null, "Next": "CollectionPublish" } ] @@ -377,7 +377,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "CollectionError", - "ResultPath": "$.error" + "ResultPath": null } ] }, @@ -450,7 +450,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "DatasetError", - "ResultPath": "$.error" + "ResultPath": null } ], "ResultPath": "$.result" @@ -480,7 +480,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "DatasetError", - "ResultPath": "$.error" + "ResultPath": null } ], "ResultPath": "$.result" @@ -496,7 +496,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "CollectionPublish", - "ResultPath": "$.error" + "ResultPath": null } ], "OutputPath": "$[0]", diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 5d9c7f95d2399..0c823e7764e56 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,7 +66,8 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - + if collection.collection_id.id not in ["0aab20b3-c30c-4606-bd2e-d20dae739c45"]: + continue if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From a126899c65488b3a9f5831dbfe23ab129f71a674 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 17:00:23 -0700 Subject: [PATCH 06/11] undo can_publish --- .happy/terraform/modules/schema_migration/main.tf | 4 ++++ backend/layers/processing/schema_migration.py | 8 +++++--- .../schema_migration/test_publish_and_cleanup.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index bdee78f14768c..1bc5f4e06eef6 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -353,6 +353,10 @@ resource aws_sfn_state_machine 
sfn_schema_migration { "Name": "COLLECTION_VERSION_ID", "Value.$": "$.collection_version_id" }, + { + "Name": "CAN_PUBLISH", + "Value.$": "$.can_publish" + }, { "Name": "TASK_TOKEN", "Value.$": "$$.Task.Token" diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 0c823e7764e56..fb279464c3a41 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -175,6 +175,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can # ^^^ The top level fields are used for handling error cases in the AWS SFN. "datasets": [ { + "can_publish": str(can_publish), "collection_id": collection_id, "collection_url": collection_url, "collection_version_id": private_collection_version_id, @@ -196,7 +197,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can self._store_sfn_response("publish_and_cleanup", version.collection_id.id, response) return response - def publish_and_cleanup(self, collection_version_id: str) -> list: + def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> list: errors = [] collection_version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id)) object_keys_to_delete = [] @@ -259,7 +260,7 @@ def publish_and_cleanup(self, collection_version_id: str) -> list: self.s3_provider.delete_files(self.artifact_bucket, object_keys_to_delete) if errors: self._store_sfn_response("report/errors", collection_version_id, errors) - elif extra_info["can_publish"] == "true": + elif can_publish: self.business_logic.publish_collection_version(collection_version.version_id) return errors @@ -381,8 +382,9 @@ def migrate(self, step_name) -> bool: ) elif step_name == "collection_publish": collection_version_id = os.environ["COLLECTION_VERSION_ID"] + can_publish = os.environ["CAN_PUBLISH"].lower() == "true" publish_and_cleanup = self.error_wrapper(self.publish_and_cleanup, collection_version_id) - response = publish_and_cleanup(collection_version_id=collection_version_id) + response = publish_and_cleanup(collection_version_id=collection_version_id, can_publish=can_publish) elif step_name == "report": response = self.report() self.logger.info("output", extra={"response": response}) diff --git a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py b/tests/unit/processing/schema_migration/test_publish_and_cleanup.py index bc2ef2c0fe2ca..06cc3f347412f 100644 --- a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py +++ b/tests/unit/processing/schema_migration/test_publish_and_cleanup.py @@ -135,7 +135,7 @@ def test_can_not_publish(self, mock_json, local_schema_migrate): ] ) local_schema_migrate.business_logic.get_collection_version.return_value = collection_version - errors = local_schema_migrate.publish_and_cleanup(collection_version.version_id.id, False) + errors = local_schema_migrate.publish_and_cleanup(collection_version.version_id.id) assert errors == [] local_schema_migrate.business_logic.publish_collection_version.assert_not_called() local_schema_migrate.s3_provider.delete_files.assert_any_call( From 0dca596cf1169a152cb697db249ae1591d6ade24 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 11:22:32 -0400 Subject: [PATCH 07/11] filter by spatial collection id --- backend/layers/processing/schema_migration.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git 
a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 68afe158f5c7a..355f9d4173bf3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -62,11 +62,34 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: has_revision = set() # iterates over unpublished collections first, so published versions are skipped if there is an active revision + + # temp: filter for Collections containing visium and slide-seq-v2 datasets + COLLECTIONS = [ + "e68d20e4-5c51-4f6c-b599-30a08858c5ce", + "1dbbd47f-80be-4048-8516-c9fa50cb4890", + "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + "362505ba-aff6-4589-a407-667a9e18bf8a", + "8dac14c9-e100-45c7-8ace-051ac2b7737a", + "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + "8fcfd5ec-e671-4836-9a96-52110e11db32", + "790b6cbd-2cec-4782-8173-08d53e6afc43", + "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + "d74b6979-efba-47cd-990a-9d80ccf29055", + "02b01703-bf1b-48de-b99a-23bef8cccc81", + "8e880741-bf9a-4c8e-9227-934204631d2a", + "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + "21bbfaec-6958-46bc-b1cd-1535752f6304", + "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + "0aab20b3-c30c-4606-bd2e-d20dae739c45", + ] + for collection in self.fetch_collections(): _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - + if collection.collection_id.id not in COLLECTIONS: + continue if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From c411aed30071e7403ce9538fbafdcd1c22600c79 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 13:15:30 -0400 Subject: [PATCH 08/11] filter by spatial collection id --- backend/layers/processing/schema_migration.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 355f9d4173bf3..2fdaf19056bd3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,22 +66,22 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: # temp: filter for Collections containing visium and slide-seq-v2 datasets COLLECTIONS = [ "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - "1dbbd47f-80be-4048-8516-c9fa50cb4890", - "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - "362505ba-aff6-4589-a407-667a9e18bf8a", - "8dac14c9-e100-45c7-8ace-051ac2b7737a", - "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - "8fcfd5ec-e671-4836-9a96-52110e11db32", - "790b6cbd-2cec-4782-8173-08d53e6afc43", - "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - "d74b6979-efba-47cd-990a-9d80ccf29055", - "02b01703-bf1b-48de-b99a-23bef8cccc81", - "8e880741-bf9a-4c8e-9227-934204631d2a", - "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - "21bbfaec-6958-46bc-b1cd-1535752f6304", - "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - "0aab20b3-c30c-4606-bd2e-d20dae739c45", + # "1dbbd47f-80be-4048-8516-c9fa50cb4890", + # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + # "362505ba-aff6-4589-a407-667a9e18bf8a", + # "8dac14c9-e100-45c7-8ace-051ac2b7737a", + # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + # "8fcfd5ec-e671-4836-9a96-52110e11db32", + # "790b6cbd-2cec-4782-8173-08d53e6afc43", + # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + # "d74b6979-efba-47cd-990a-9d80ccf29055", + # "02b01703-bf1b-48de-b99a-23bef8cccc81", + # "8e880741-bf9a-4c8e-9227-934204631d2a", 
+ # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + # "21bbfaec-6958-46bc-b1cd-1535752f6304", + # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + # "0aab20b3-c30c-4606-bd2e-d20dae739c45", ] for collection in self.fetch_collections(): From 5d00c6fe1c19bfd27129cce7e6031ef2afa591ff Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 13:16:46 -0400 Subject: [PATCH 09/11] Revert "filter by spatial collection id" This reverts commit c411aed30071e7403ce9538fbafdcd1c22600c79. --- backend/layers/processing/schema_migration.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 2fdaf19056bd3..355f9d4173bf3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,22 +66,22 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: # temp: filter for Collections containing visium and slide-seq-v2 datasets COLLECTIONS = [ "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - # "1dbbd47f-80be-4048-8516-c9fa50cb4890", - # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - # "362505ba-aff6-4589-a407-667a9e18bf8a", - # "8dac14c9-e100-45c7-8ace-051ac2b7737a", - # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - # "8fcfd5ec-e671-4836-9a96-52110e11db32", - # "790b6cbd-2cec-4782-8173-08d53e6afc43", - # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - # "d74b6979-efba-47cd-990a-9d80ccf29055", - # "02b01703-bf1b-48de-b99a-23bef8cccc81", - # "8e880741-bf9a-4c8e-9227-934204631d2a", - # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - # "21bbfaec-6958-46bc-b1cd-1535752f6304", - # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - # "0aab20b3-c30c-4606-bd2e-d20dae739c45", + "1dbbd47f-80be-4048-8516-c9fa50cb4890", + "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + "362505ba-aff6-4589-a407-667a9e18bf8a", + "8dac14c9-e100-45c7-8ace-051ac2b7737a", + "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + "8fcfd5ec-e671-4836-9a96-52110e11db32", + "790b6cbd-2cec-4782-8173-08d53e6afc43", + "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + "d74b6979-efba-47cd-990a-9d80ccf29055", + "02b01703-bf1b-48de-b99a-23bef8cccc81", + "8e880741-bf9a-4c8e-9227-934204631d2a", + "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + "21bbfaec-6958-46bc-b1cd-1535752f6304", + "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + "0aab20b3-c30c-4606-bd2e-d20dae739c45", ] for collection in self.fetch_collections(): From 43bf26f7c05d0f236e178794714a7607d29c8b13 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Fri, 31 May 2024 10:36:06 -0700 Subject: [PATCH 10/11] add comments --- backend/layers/processing/schema_migration.py | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 5d7d0b57e0cbb..c888a0538dec9 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,32 +66,34 @@ def gather_collections(self, auto_publish: bool) -> Tuple[Dict[str, str], Dict[s # iterates over unpublished collections first, so published versions are skipped if there is an active revision # temp: filter for Collections containing visium and slide-seq-v2 datasets - COLLECTIONS = [ - "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - "1dbbd47f-80be-4048-8516-c9fa50cb4890", - "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - 
"362505ba-aff6-4589-a407-667a9e18bf8a", - "8dac14c9-e100-45c7-8ace-051ac2b7737a", - "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - "8fcfd5ec-e671-4836-9a96-52110e11db32", - "790b6cbd-2cec-4782-8173-08d53e6afc43", - "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - "d74b6979-efba-47cd-990a-9d80ccf29055", - "02b01703-bf1b-48de-b99a-23bef8cccc81", - "8e880741-bf9a-4c8e-9227-934204631d2a", - "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - "21bbfaec-6958-46bc-b1cd-1535752f6304", - "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - "0aab20b3-c30c-4606-bd2e-d20dae739c45", - ] + # TODO remove commented sectection bellow + # COLLECTIONS = [ + # "e68d20e4-5c51-4f6c-b599-30a08858c5ce", + # "1dbbd47f-80be-4048-8516-c9fa50cb4890", + # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + # "362505ba-aff6-4589-a407-667a9e18bf8a", + # "8dac14c9-e100-45c7-8ace-051ac2b7737a", + # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + # "8fcfd5ec-e671-4836-9a96-52110e11db32", + # "790b6cbd-2cec-4782-8173-08d53e6afc43", + # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + # "d74b6979-efba-47cd-990a-9d80ccf29055", + # "02b01703-bf1b-48de-b99a-23bef8cccc81", + # "8e880741-bf9a-4c8e-9227-934204631d2a", + # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + # "21bbfaec-6958-46bc-b1cd-1535752f6304", + # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + # "0aab20b3-c30c-4606-bd2e-d20dae739c45", + # ] for collection in self.fetch_collections(): _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - if collection.collection_id.id not in COLLECTIONS: - continue + # if collection.collection_id.id not in COLLECTIONS: + # continue + # TODO: remove commented section if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From a114eeee8f67e722444ba4121f8669dbc028c9fd Mon Sep 17 00:00:00 2001 From: Bento007 Date: Fri, 31 May 2024 11:46:18 -0700 Subject: [PATCH 11/11] Fix tests --- backend/layers/processing/h5ad_data_file.py | 6 ++++-- backend/layers/processing/process_cxg.py | 4 ++-- backend/layers/processing/process_logic.py | 2 +- backend/layers/processing/process_seurat.py | 2 +- .../processing/utils/cxg_generation_utils.py | 4 ++-- backend/layers/processing/utils/spatial.py | 11 +++++++---- tests/unit/processing/test_h5ad_data_file.py | 15 ++++++++------- 7 files changed, 25 insertions(+), 19 deletions(-) diff --git a/backend/layers/processing/h5ad_data_file.py b/backend/layers/processing/h5ad_data_file.py index c060c107cabec..50424d32199fe 100644 --- a/backend/layers/processing/h5ad_data_file.py +++ b/backend/layers/processing/h5ad_data_file.py @@ -54,7 +54,9 @@ def __init__( self.validate_anndata() - def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_to_cxg_colors=True): + def to_cxg( + self, output_cxg_directory, sparse_threshold, dataset_version_id, convert_anndata_colors_to_cxg_colors=True + ): """ Writes the following attributes of the anndata to CXG: 1) the metadata as metadata attached to an empty DenseArray, 2) the obs DataFrame as a DenseArray, 3) the var DataFrame as a DenseArray, 4) all valid @@ -80,7 +82,7 @@ def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_ convert_dataframe_to_cxg_array(output_cxg_directory, "var", self.var, self.var_index_column_name, ctx) logging.info("\t...dataset var dataframe saved") - convert_uns_to_cxg_group(output_cxg_directory, self.anndata.uns, "uns", ctx) + convert_uns_to_cxg_group(output_cxg_directory, 
self.anndata.uns, dataset_version_id, "uns", ctx) logging.info("\t...dataset uns dataframe saved") self.write_anndata_embeddings_to_cxg(output_cxg_directory, ctx) diff --git a/backend/layers/processing/process_cxg.py b/backend/layers/processing/process_cxg.py index 864cc4e470dc4..64c2946153fcf 100644 --- a/backend/layers/processing/process_cxg.py +++ b/backend/layers/processing/process_cxg.py @@ -67,7 +67,7 @@ def process( self.process_cxg(labeled_h5ad_filename, dataset_version_id, cellxgene_bucket, current_artifacts) @logit - def make_cxg(self, local_filename): + def make_cxg(self, local_filename, dataset_version_id): """ Convert the uploaded H5AD file to the CXG format servicing the cellxgene Explorer. """ @@ -75,7 +75,7 @@ def make_cxg(self, local_filename): cxg_output_container = local_filename.replace(".h5ad", ".cxg") try: h5ad_data_file = H5ADDataFile(local_filename, var_index_column_name="feature_name") - h5ad_data_file.to_cxg(cxg_output_container, sparse_threshold=25.0) + h5ad_data_file.to_cxg(cxg_output_container, sparse_threshold=25.0, dataset_version_id=dataset_version_id.id) except Exception as ex: # TODO use a specialized exception msg = "CXG conversion failed." diff --git a/backend/layers/processing/process_logic.py b/backend/layers/processing/process_logic.py index c85541510b6b4..338948bbf6681 100644 --- a/backend/layers/processing/process_logic.py +++ b/backend/layers/processing/process_logic.py @@ -114,7 +114,7 @@ def convert_file( start = datetime.now() try: self.update_processing_status(dataset_version_id, processing_status_key, DatasetConversionStatus.CONVERTING) - file_dir = converter(local_filename) + file_dir = converter(local_filename, dataset_version_id) self.update_processing_status(dataset_version_id, processing_status_key, DatasetConversionStatus.CONVERTED) self.logger.info(f"Finished converting {converter} in {datetime.now() - start}") except Exception: diff --git a/backend/layers/processing/process_seurat.py b/backend/layers/processing/process_seurat.py index 7a7b31a776a57..3156fdb8de652 100644 --- a/backend/layers/processing/process_seurat.py +++ b/backend/layers/processing/process_seurat.py @@ -96,7 +96,7 @@ def process(self, dataset_version_id: DatasetVersionId, artifact_bucket: str, da ) @logit - def make_seurat(self, local_filename): + def make_seurat(self, local_filename, *args, **kwargs): """ Create a Seurat rds file from the AnnData file. 
""" diff --git a/backend/layers/processing/utils/cxg_generation_utils.py b/backend/layers/processing/utils/cxg_generation_utils.py index 3d46f0cda13e8..78a104efa4f5c 100644 --- a/backend/layers/processing/utils/cxg_generation_utils.py +++ b/backend/layers/processing/utils/cxg_generation_utils.py @@ -34,7 +34,7 @@ def convert_dictionary_to_cxg_group(cxg_container, metadata_dict, group_metadata metadata_array.meta[key] = value -def convert_uns_to_cxg_group(cxg_container, metadata_dict, group_metadata_name="uns", ctx=None): +def convert_uns_to_cxg_group(cxg_container, metadata_dict, dataset_version_id, group_metadata_name="uns", ctx=None): """ Convert uns (unstructured) metadata to CXG output directory specified Generate deep zoom assets for spatial data @@ -53,7 +53,7 @@ def convert_uns_to_cxg_group(cxg_container, metadata_dict, group_metadata_name=" for object_id, content in value.items(): if object_id not in SPATIAL_KEYS_EXCLUDE: object_filtered = spatial_processor.filter_spatial_data(content, object_id) - spatial_processor.create_deep_zoom_assets(cxg_container, content) + spatial_processor.create_deep_zoom_assets(cxg_container, content, dataset_version_id) metadata_array.meta[key] = pickle.dumps(object_filtered) diff --git a/backend/layers/processing/utils/spatial.py b/backend/layers/processing/utils/spatial.py index 2acef734bb1bb..6e29c73db9aaa 100644 --- a/backend/layers/processing/utils/spatial.py +++ b/backend/layers/processing/utils/spatial.py @@ -116,23 +116,26 @@ def _generate_deep_zoom_assets(self, image_array, assets_folder): image = pyvips.Image.new_from_memory(linear.data, w, h, bands, "uchar") image.dzsave(os.path.join(assets_folder, "spatial"), suffix=".jpeg") - def _upload_assets(self, assets_folder): + def _upload_assets(self, assets_folder, dataset_version_id): """ Upload the deep zoom assets to the S3 bucket. Args: assets_folder (str): The folder containing the assets. + dataset_version_id (str): The UUID uniquely identifying the dataset version. """ - s3_uri = f"s3://{self.bucket_name}/{self.asset_directory}/{os.path.basename(assets_folder)}" + version_id = dataset_version_id.replace(".cxg", "") + s3_uri = f"s3://{self.bucket_name}/{self.asset_directory}/{version_id}" self.s3_provider.upload_directory(assets_folder, s3_uri) - def create_deep_zoom_assets(self, container_name, content): + def create_deep_zoom_assets(self, container_name, content, dataset_version_id): """ Create deep zoom assets for a container. Args: container_name (str): The name of the container. content (dict): The content dictionary containing the image array. + dataset_version_id (str): The UUID uniquely identifying the dataset version. 
""" try: with tempfile.TemporaryDirectory() as temp_dir: @@ -142,7 +145,7 @@ def create_deep_zoom_assets(self, container_name, content): image_array, _ = self._fetch_image(content) processed_image = self._process_and_flip_image(image_array) self._generate_deep_zoom_assets(processed_image, assets_folder) - self._upload_assets(assets_folder) + self._upload_assets(assets_folder, dataset_version_id) except Exception as e: logger.exception(f"Failed to create and upload deep zoom assets: {e}") raise diff --git a/tests/unit/processing/test_h5ad_data_file.py b/tests/unit/processing/test_h5ad_data_file.py index fdb1a4967fd40..7fbeb0e7bcceb 100644 --- a/tests/unit/processing/test_h5ad_data_file.py +++ b/tests/unit/processing/test_h5ad_data_file.py @@ -20,6 +20,7 @@ def setUp(self): self.sample_h5ad_filename = self._write_anndata_to_file(self.sample_anndata) self.sample_output_directory = path.splitext(self.sample_h5ad_filename)[0] + ".cxg" + self.dataset_version_id = "test_dataset_version_id" def tearDown(self): if self.sample_h5ad_filename: @@ -108,31 +109,31 @@ def test__create_h5ad_data_file__obs_and_var_index_names_specified_doesnt_exist_ def test__to_cxg__simple_anndata_no_corpora_and_sparse(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 100) + h5ad_file.to_cxg(self.sample_output_directory, 100, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, True) def test__to_cxg__simple_anndata_with_corpora_and_sparse(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 100) + h5ad_file.to_cxg(self.sample_output_directory, 100, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, True) def test__to_cxg__simple_anndata_no_corpora_and_dense(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) def test__to_cxg__simple_anndata_with_corpora_and_dense(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) def test__to_cxg__simple_anndata_with_corpora_and_dense_using_feature_name_var_index(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename, var_index_column_name="feature_name") - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) self._validate_cxg_var_index_column_match( @@ -142,7 +143,7 @@ def test__to_cxg__simple_anndata_with_corpora_and_dense_using_feature_name_var_i def test__to_cxg__simple_anndata_with_different_var_index_than_h5ad(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename, var_index_column_name="int_category") - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_var_index_column_match( self.sample_output_directory, @@ -155,7 +156,7 @@ def 
test__to_cxg__with_sparse_column_encoding(self): sparse_with_column_shift_filename = self._write_anndata_to_file(anndata) h5ad_file = H5ADDataFile(sparse_with_column_shift_filename) - h5ad_file.to_cxg(self.sample_output_directory, 50) + h5ad_file.to_cxg(self.sample_output_directory, 50, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match( sparse_with_column_shift_filename, self.sample_output_directory, False, has_column_encoding=True