From fa7adb0280f4a9a47a0ede5b91e6c3bfc1c73a65 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 23 May 2024 15:32:54 -0700 Subject: [PATCH 01/11] update requirements to use new cellxgene-schema 5.1 --- python_dependencies/processing/requirements.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python_dependencies/processing/requirements.txt b/python_dependencies/processing/requirements.txt index 5396acfd9dc69..31e8a22bf0f5b 100644 --- a/python_dependencies/processing/requirements.txt +++ b/python_dependencies/processing/requirements.txt @@ -2,12 +2,13 @@ anndata==0.8.0 awscli boto3>=1.11.17 botocore>=1.14.17 -cellxgene-schema +# install the cellxgene-schema version from latest commit on the main branch from github +git+https://github.com/chanzuckerberg/single-cell-curation.git@main#egg=cellxgene-schema&subdirectory=cellxgene_schema_cli dataclasses-json ddtrace==2.1.4 -numba==0.56.2 -numpy==1.23.2 -pandas==1.4.4 +numba +numpy +pandas psutil>=5.9.0 psycopg2-binary==2.* pyarrow>=1.0 @@ -18,7 +19,7 @@ requests>=2.22.0 rpy2==3.5.16 rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability s3fs==0.4.2 -scanpy==1.9.3 +scanpy SQLAlchemy==2.* tenacity tiledb==0.25.0 # Portal's tiledb version should always be the same or older than Explorer's From cd4754f3a31201e205ae3338e646527475d7b9a9 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 23 May 2024 16:32:23 -0700 Subject: [PATCH 02/11] fix tests --- tests/unit/processing/test_h5ad_data_file.py | 4 ++-- tests/unit/processing/test_spatial_assets_utils.py | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/processing/test_h5ad_data_file.py b/tests/unit/processing/test_h5ad_data_file.py index 69f13aff4333b..fdb1a4967fd40 100644 --- a/tests/unit/processing/test_h5ad_data_file.py +++ b/tests/unit/processing/test_h5ad_data_file.py @@ -170,7 +170,7 @@ def test__slash_in_attribute_name(self): col_name = "fo/o" - attrs = [tiledb.Attr(name=col_name, dtype=np.int)] + attrs = [tiledb.Attr(name=col_name, dtype=int)] domain = tiledb.Domain(tiledb.Dim(domain=(0, 99), tile=100, dtype=np.uint32)) schema = tiledb.ArraySchema( domain=domain, sparse=False, attrs=attrs, cell_order="row-major", tile_order="row-major" @@ -180,7 +180,7 @@ def test__slash_in_attribute_name(self): try: with tiledb.open("foo", mode="w") as A: value = dict() - value[col_name] = np.zeros((100,), dtype=np.int) + value[col_name] = np.zeros((100,), dtype=int) A[:] = value # if there's a regression, this statement will throw a TileDBError # if we get here we're good finally: diff --git a/tests/unit/processing/test_spatial_assets_utils.py b/tests/unit/processing/test_spatial_assets_utils.py index c74a8c2cf0e0d..7bc0a9f960bc6 100644 --- a/tests/unit/processing/test_spatial_assets_utils.py +++ b/tests/unit/processing/test_spatial_assets_utils.py @@ -236,7 +236,7 @@ def test__upload_assets_failure(spatial_processor, output_folder, mocker): mock_upload.assert_called_once_with(output_folder, expected_s3_uri) -def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatial_data, mocker): +def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatial_data, mocker, tmpdir): mock_fetch_image = mocker.patch.object(spatial_processor, "_fetch_image") mock_process_and_flip_image = mocker.patch.object(spatial_processor, "_process_and_flip_image") mock_generate_deep_zoom_assets = mocker.patch.object(spatial_processor, "_generate_deep_zoom_assets") @@ -244,8 +244,7 @@ def 
test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # mock the TemporaryDirectory context manager mock_temp_dir = mocker.patch("tempfile.TemporaryDirectory") - temp_dir_name = "/mock/temp/dir" - mock_temp_dir.return_value.__enter__.return_value = temp_dir_name + mock_temp_dir.return_value.__enter__.return_value = tmpdir # mock return values for the internal methods mock_fetch_image.return_value = np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8) @@ -254,7 +253,7 @@ def test__create_deep_zoom_assets(spatial_processor, cxg_container, valid_spatia # call the method under test spatial_processor.create_deep_zoom_assets(cxg_container, valid_spatial_data) - assets_folder = os.path.join(temp_dir_name, cxg_container.replace(".cxg", "")) + assets_folder = os.path.join(tmpdir, cxg_container.replace(".cxg", "")) # assertions to ensure each step is called mock_fetch_image.assert_called_once_with(valid_spatial_data) From a00aad2ea8c89a79059ad09fcc146e03f5702765 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:33:07 -0700 Subject: [PATCH 03/11] pass can_publish using s3 --- backend/layers/processing/schema_migration.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 68afe158f5c7a..5d9c7f95d2399 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -174,7 +174,6 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can # ^^^ The top level fields are used for handling error cases in the AWS SFN. "datasets": [ { - "can_publish": str(can_publish), "collection_id": collection_id, "collection_url": collection_url, "collection_version_id": private_collection_version_id, @@ -196,7 +195,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can self._store_sfn_response("publish_and_cleanup", version.collection_id.id, response) return response - def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> list: + def publish_and_cleanup(self, collection_version_id: str) -> list: errors = [] collection_version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id)) object_keys_to_delete = [] @@ -259,7 +258,7 @@ def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> self.s3_provider.delete_files(self.artifact_bucket, object_keys_to_delete) if errors: self._store_sfn_response("report/errors", collection_version_id, errors) - elif can_publish: + elif extra_info["can_publish"] == "true": self.business_logic.publish_collection_version(collection_version.version_id) return errors @@ -381,9 +380,8 @@ def migrate(self, step_name) -> bool: ) elif step_name == "collection_publish": collection_version_id = os.environ["COLLECTION_VERSION_ID"] - can_publish = os.environ["CAN_PUBLISH"].lower() == "true" publish_and_cleanup = self.error_wrapper(self.publish_and_cleanup, collection_version_id) - response = publish_and_cleanup(collection_version_id=collection_version_id, can_publish=can_publish) + response = publish_and_cleanup(collection_version_id=collection_version_id) elif step_name == "report": response = self.report() self.logger.info("output", extra={"response": response}) From 98a2ecf03b4ebc0cb451f025e04479ed8889e5ab Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:33:23 -0700 Subject: [PATCH 04/11] pass can_publish using s3 --- 
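Note on this change, together with PATCH 03: the can_publish flag appears to stop traveling through the step-function input and a CAN_PUBLISH container environment variable; instead it is carried in the JSON response that collection_migrate already stores with _store_sfn_response and is read back as extra_info inside publish_and_cleanup. Below is a minimal sketch of that round trip; the in-memory dict and the store/load helper names are stand-ins for illustration only, not the real S3 provider API or the project's actual helper names.

    import json

    bucket = {}  # stand-in for the artifact bucket that _store_sfn_response writes to

    def store_sfn_response(directory: str, key: str, response: dict) -> None:
        # serialize the step's response the way the real code persists it to S3
        bucket[f"{directory}/{key}.json"] = json.dumps(response)

    def load_sfn_response(directory: str, key: str) -> dict:
        # read the stored response back in a later step
        return json.loads(bucket[f"{directory}/{key}.json"])

    # collection_migrate step: stash the flag alongside the rest of the per-collection response
    store_sfn_response("publish_and_cleanup", "example-collection-id", {"can_publish": str(True)})

    # collection_publish step: recover it later instead of reading os.environ["CAN_PUBLISH"]
    extra_info = load_sfn_response("publish_and_cleanup", "example-collection-id")
    if extra_info["can_publish"].lower() == "true":
        print("publish_collection_version would be called here")
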
.happy/terraform/modules/schema_migration/main.tf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index 07025411f412c..f1ce7d083cc4e 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -353,10 +353,6 @@ resource aws_sfn_state_machine sfn_schema_migration { "Name": "COLLECTION_VERSION_ID", "Value.$": "$.collection_version_id" }, - { - "Name": "CAN_PUBLISH", - "Value.$": "$.can_publish" - }, { "Name": "TASK_TOKEN", "Value.$": "$$.Task.Token" From 5f21af208698a60b1c04074b39a99718ffa53d58 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 16:39:40 -0700 Subject: [PATCH 05/11] don't pass the error to future steps --- .happy/terraform/modules/schema_migration/main.tf | 10 +++++----- backend/layers/processing/schema_migration.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index f1ce7d083cc4e..bdee78f14768c 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -313,7 +313,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "ErrorEquals": [ "States.ALL" ], - "ResultPath": "$.error", + "ResultPath": null, "Next": "CollectionPublish" } ] @@ -377,7 +377,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "CollectionError", - "ResultPath": "$.error" + "ResultPath": null } ] }, @@ -450,7 +450,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "DatasetError", - "ResultPath": "$.error" + "ResultPath": null } ], "ResultPath": "$.result" @@ -480,7 +480,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "DatasetError", - "ResultPath": "$.error" + "ResultPath": null } ], "ResultPath": "$.result" @@ -496,7 +496,7 @@ resource aws_sfn_state_machine sfn_schema_migration { "States.ALL" ], "Next": "CollectionPublish", - "ResultPath": "$.error" + "ResultPath": null } ], "OutputPath": "$[0]", diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 5d9c7f95d2399..0c823e7764e56 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,7 +66,8 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - + if collection.collection_id.id not in ["0aab20b3-c30c-4606-bd2e-d20dae739c45"]: + continue if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From a126899c65488b3a9f5831dbfe23ab129f71a674 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Tue, 28 May 2024 17:00:23 -0700 Subject: [PATCH 06/11] undo can_publish --- .happy/terraform/modules/schema_migration/main.tf | 4 ++++ backend/layers/processing/schema_migration.py | 8 +++++--- .../schema_migration/test_publish_and_cleanup.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.happy/terraform/modules/schema_migration/main.tf b/.happy/terraform/modules/schema_migration/main.tf index bdee78f14768c..1bc5f4e06eef6 100644 --- a/.happy/terraform/modules/schema_migration/main.tf +++ b/.happy/terraform/modules/schema_migration/main.tf @@ -353,6 +353,10 @@ resource aws_sfn_state_machine 
sfn_schema_migration { "Name": "COLLECTION_VERSION_ID", "Value.$": "$.collection_version_id" }, + { + "Name": "CAN_PUBLISH", + "Value.$": "$.can_publish" + }, { "Name": "TASK_TOKEN", "Value.$": "$$.Task.Token" diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 0c823e7764e56..fb279464c3a41 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -175,6 +175,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can # ^^^ The top level fields are used for handling error cases in the AWS SFN. "datasets": [ { + "can_publish": str(can_publish), "collection_id": collection_id, "collection_url": collection_url, "collection_version_id": private_collection_version_id, @@ -196,7 +197,7 @@ def collection_migrate(self, collection_id: str, collection_version_id: str, can self._store_sfn_response("publish_and_cleanup", version.collection_id.id, response) return response - def publish_and_cleanup(self, collection_version_id: str) -> list: + def publish_and_cleanup(self, collection_version_id: str, can_publish: bool) -> list: errors = [] collection_version = self.business_logic.get_collection_version(CollectionVersionId(collection_version_id)) object_keys_to_delete = [] @@ -259,7 +260,7 @@ def publish_and_cleanup(self, collection_version_id: str) -> list: self.s3_provider.delete_files(self.artifact_bucket, object_keys_to_delete) if errors: self._store_sfn_response("report/errors", collection_version_id, errors) - elif extra_info["can_publish"] == "true": + elif can_publish: self.business_logic.publish_collection_version(collection_version.version_id) return errors @@ -381,8 +382,9 @@ def migrate(self, step_name) -> bool: ) elif step_name == "collection_publish": collection_version_id = os.environ["COLLECTION_VERSION_ID"] + can_publish = os.environ["CAN_PUBLISH"].lower() == "true" publish_and_cleanup = self.error_wrapper(self.publish_and_cleanup, collection_version_id) - response = publish_and_cleanup(collection_version_id=collection_version_id) + response = publish_and_cleanup(collection_version_id=collection_version_id, can_publish=can_publish) elif step_name == "report": response = self.report() self.logger.info("output", extra={"response": response}) diff --git a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py b/tests/unit/processing/schema_migration/test_publish_and_cleanup.py index bc2ef2c0fe2ca..06cc3f347412f 100644 --- a/tests/unit/processing/schema_migration/test_publish_and_cleanup.py +++ b/tests/unit/processing/schema_migration/test_publish_and_cleanup.py @@ -135,7 +135,7 @@ def test_can_not_publish(self, mock_json, local_schema_migrate): ] ) local_schema_migrate.business_logic.get_collection_version.return_value = collection_version - errors = local_schema_migrate.publish_and_cleanup(collection_version.version_id.id, False) + errors = local_schema_migrate.publish_and_cleanup(collection_version.version_id.id) assert errors == [] local_schema_migrate.business_logic.publish_collection_version.assert_not_called() local_schema_migrate.s3_provider.delete_files.assert_any_call( From 0dca596cf1169a152cb697db249ae1591d6ade24 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 11:22:32 -0400 Subject: [PATCH 07/11] filter by spatial collection id --- backend/layers/processing/schema_migration.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git 
a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 68afe158f5c7a..355f9d4173bf3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -62,11 +62,34 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: has_revision = set() # iterates over unpublished collections first, so published versions are skipped if there is an active revision + + # temp: filter for Collections containing visium and slide-seq-v2 datasets + COLLECTIONS = [ + "e68d20e4-5c51-4f6c-b599-30a08858c5ce", + "1dbbd47f-80be-4048-8516-c9fa50cb4890", + "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + "362505ba-aff6-4589-a407-667a9e18bf8a", + "8dac14c9-e100-45c7-8ace-051ac2b7737a", + "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + "8fcfd5ec-e671-4836-9a96-52110e11db32", + "790b6cbd-2cec-4782-8173-08d53e6afc43", + "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + "d74b6979-efba-47cd-990a-9d80ccf29055", + "02b01703-bf1b-48de-b99a-23bef8cccc81", + "8e880741-bf9a-4c8e-9227-934204631d2a", + "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + "21bbfaec-6958-46bc-b1cd-1535752f6304", + "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + "0aab20b3-c30c-4606-bd2e-d20dae739c45", + ] + for collection in self.fetch_collections(): _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - + if collection.collection_id.id not in COLLECTIONS: + continue if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From c411aed30071e7403ce9538fbafdcd1c22600c79 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 13:15:30 -0400 Subject: [PATCH 08/11] filter by spatial collection id --- backend/layers/processing/schema_migration.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 355f9d4173bf3..2fdaf19056bd3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,22 +66,22 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: # temp: filter for Collections containing visium and slide-seq-v2 datasets COLLECTIONS = [ "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - "1dbbd47f-80be-4048-8516-c9fa50cb4890", - "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - "362505ba-aff6-4589-a407-667a9e18bf8a", - "8dac14c9-e100-45c7-8ace-051ac2b7737a", - "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - "8fcfd5ec-e671-4836-9a96-52110e11db32", - "790b6cbd-2cec-4782-8173-08d53e6afc43", - "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - "d74b6979-efba-47cd-990a-9d80ccf29055", - "02b01703-bf1b-48de-b99a-23bef8cccc81", - "8e880741-bf9a-4c8e-9227-934204631d2a", - "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - "21bbfaec-6958-46bc-b1cd-1535752f6304", - "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - "0aab20b3-c30c-4606-bd2e-d20dae739c45", + # "1dbbd47f-80be-4048-8516-c9fa50cb4890", + # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + # "362505ba-aff6-4589-a407-667a9e18bf8a", + # "8dac14c9-e100-45c7-8ace-051ac2b7737a", + # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + # "8fcfd5ec-e671-4836-9a96-52110e11db32", + # "790b6cbd-2cec-4782-8173-08d53e6afc43", + # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + # "d74b6979-efba-47cd-990a-9d80ccf29055", + # "02b01703-bf1b-48de-b99a-23bef8cccc81", + # "8e880741-bf9a-4c8e-9227-934204631d2a", 
+ # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + # "21bbfaec-6958-46bc-b1cd-1535752f6304", + # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + # "0aab20b3-c30c-4606-bd2e-d20dae739c45", ] for collection in self.fetch_collections(): From 5d00c6fe1c19bfd27129cce7e6031ef2afa591ff Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Wed, 29 May 2024 13:16:46 -0400 Subject: [PATCH 09/11] Revert "filter by spatial collection id" This reverts commit c411aed30071e7403ce9538fbafdcd1c22600c79. --- backend/layers/processing/schema_migration.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 2fdaf19056bd3..355f9d4173bf3 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,22 +66,22 @@ def gather_collections(self, auto_publish: bool) -> List[Dict[str, str]]: # temp: filter for Collections containing visium and slide-seq-v2 datasets COLLECTIONS = [ "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - # "1dbbd47f-80be-4048-8516-c9fa50cb4890", - # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - # "362505ba-aff6-4589-a407-667a9e18bf8a", - # "8dac14c9-e100-45c7-8ace-051ac2b7737a", - # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - # "8fcfd5ec-e671-4836-9a96-52110e11db32", - # "790b6cbd-2cec-4782-8173-08d53e6afc43", - # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - # "d74b6979-efba-47cd-990a-9d80ccf29055", - # "02b01703-bf1b-48de-b99a-23bef8cccc81", - # "8e880741-bf9a-4c8e-9227-934204631d2a", - # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - # "21bbfaec-6958-46bc-b1cd-1535752f6304", - # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - # "0aab20b3-c30c-4606-bd2e-d20dae739c45", + "1dbbd47f-80be-4048-8516-c9fa50cb4890", + "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + "362505ba-aff6-4589-a407-667a9e18bf8a", + "8dac14c9-e100-45c7-8ace-051ac2b7737a", + "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + "8fcfd5ec-e671-4836-9a96-52110e11db32", + "790b6cbd-2cec-4782-8173-08d53e6afc43", + "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + "d74b6979-efba-47cd-990a-9d80ccf29055", + "02b01703-bf1b-48de-b99a-23bef8cccc81", + "8e880741-bf9a-4c8e-9227-934204631d2a", + "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + "21bbfaec-6958-46bc-b1cd-1535752f6304", + "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + "0aab20b3-c30c-4606-bd2e-d20dae739c45", ] for collection in self.fetch_collections(): From 43bf26f7c05d0f236e178794714a7607d29c8b13 Mon Sep 17 00:00:00 2001 From: Bento007 Date: Fri, 31 May 2024 10:36:06 -0700 Subject: [PATCH 10/11] add comments --- backend/layers/processing/schema_migration.py | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/backend/layers/processing/schema_migration.py b/backend/layers/processing/schema_migration.py index 5d7d0b57e0cbb..c888a0538dec9 100644 --- a/backend/layers/processing/schema_migration.py +++ b/backend/layers/processing/schema_migration.py @@ -66,32 +66,34 @@ def gather_collections(self, auto_publish: bool) -> Tuple[Dict[str, str], Dict[s # iterates over unpublished collections first, so published versions are skipped if there is an active revision # temp: filter for Collections containing visium and slide-seq-v2 datasets - COLLECTIONS = [ - "e68d20e4-5c51-4f6c-b599-30a08858c5ce", - "1dbbd47f-80be-4048-8516-c9fa50cb4890", - "d1c3e6e4-950e-4a5f-83b4-8de13153299a", - 
"362505ba-aff6-4589-a407-667a9e18bf8a", - "8dac14c9-e100-45c7-8ace-051ac2b7737a", - "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", - "8fcfd5ec-e671-4836-9a96-52110e11db32", - "790b6cbd-2cec-4782-8173-08d53e6afc43", - "e0612e2e-0867-4773-bff9-b5e08b2a77e7", - "d74b6979-efba-47cd-990a-9d80ccf29055", - "02b01703-bf1b-48de-b99a-23bef8cccc81", - "8e880741-bf9a-4c8e-9227-934204631d2a", - "a96133de-e951-4e2d-ace6-59db8b3bfb1d", - "21bbfaec-6958-46bc-b1cd-1535752f6304", - "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", - "7c1fbbae-5f69-4e3e-950d-d819466aecb2", - "0aab20b3-c30c-4606-bd2e-d20dae739c45", - ] + # TODO remove commented sectection bellow + # COLLECTIONS = [ + # "e68d20e4-5c51-4f6c-b599-30a08858c5ce", + # "1dbbd47f-80be-4048-8516-c9fa50cb4890", + # "d1c3e6e4-950e-4a5f-83b4-8de13153299a", + # "362505ba-aff6-4589-a407-667a9e18bf8a", + # "8dac14c9-e100-45c7-8ace-051ac2b7737a", + # "19b54e58-8e71-497b-ad03-aa9a6ca52ce0", + # "8fcfd5ec-e671-4836-9a96-52110e11db32", + # "790b6cbd-2cec-4782-8173-08d53e6afc43", + # "e0612e2e-0867-4773-bff9-b5e08b2a77e7", + # "d74b6979-efba-47cd-990a-9d80ccf29055", + # "02b01703-bf1b-48de-b99a-23bef8cccc81", + # "8e880741-bf9a-4c8e-9227-934204631d2a", + # "a96133de-e951-4e2d-ace6-59db8b3bfb1d", + # "21bbfaec-6958-46bc-b1cd-1535752f6304", + # "2d69ef57-864d-4f58-93f5-4fbd6fab06a6", + # "7c1fbbae-5f69-4e3e-950d-d819466aecb2", + # "0aab20b3-c30c-4606-bd2e-d20dae739c45", + # ] for collection in self.fetch_collections(): _resp = {} if collection.is_published() and collection.collection_id.id in has_revision: continue - if collection.collection_id.id not in COLLECTIONS: - continue + # if collection.collection_id.id not in COLLECTIONS: + # continue + # TODO: remove commented section if collection.is_published(): # published collection without an active revision _resp["can_publish"] = str(True) From a114eeee8f67e722444ba4121f8669dbc028c9fd Mon Sep 17 00:00:00 2001 From: Bento007 Date: Fri, 31 May 2024 11:46:18 -0700 Subject: [PATCH 11/11] Fix tests --- backend/layers/processing/h5ad_data_file.py | 6 ++++-- backend/layers/processing/process_cxg.py | 4 ++-- backend/layers/processing/process_logic.py | 2 +- backend/layers/processing/process_seurat.py | 2 +- .../processing/utils/cxg_generation_utils.py | 4 ++-- backend/layers/processing/utils/spatial.py | 11 +++++++---- tests/unit/processing/test_h5ad_data_file.py | 15 ++++++++------- 7 files changed, 25 insertions(+), 19 deletions(-) diff --git a/backend/layers/processing/h5ad_data_file.py b/backend/layers/processing/h5ad_data_file.py index c060c107cabec..50424d32199fe 100644 --- a/backend/layers/processing/h5ad_data_file.py +++ b/backend/layers/processing/h5ad_data_file.py @@ -54,7 +54,9 @@ def __init__( self.validate_anndata() - def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_to_cxg_colors=True): + def to_cxg( + self, output_cxg_directory, sparse_threshold, dataset_version_id, convert_anndata_colors_to_cxg_colors=True + ): """ Writes the following attributes of the anndata to CXG: 1) the metadata as metadata attached to an empty DenseArray, 2) the obs DataFrame as a DenseArray, 3) the var DataFrame as a DenseArray, 4) all valid @@ -80,7 +82,7 @@ def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_ convert_dataframe_to_cxg_array(output_cxg_directory, "var", self.var, self.var_index_column_name, ctx) logging.info("\t...dataset var dataframe saved") - convert_uns_to_cxg_group(output_cxg_directory, self.anndata.uns, "uns", ctx) + convert_uns_to_cxg_group(output_cxg_directory, 
self.anndata.uns, dataset_version_id, "uns", ctx) logging.info("\t...dataset uns dataframe saved") self.write_anndata_embeddings_to_cxg(output_cxg_directory, ctx) diff --git a/backend/layers/processing/process_cxg.py b/backend/layers/processing/process_cxg.py index 864cc4e470dc4..64c2946153fcf 100644 --- a/backend/layers/processing/process_cxg.py +++ b/backend/layers/processing/process_cxg.py @@ -67,7 +67,7 @@ def process( self.process_cxg(labeled_h5ad_filename, dataset_version_id, cellxgene_bucket, current_artifacts) @logit - def make_cxg(self, local_filename): + def make_cxg(self, local_filename, dataset_version_id): """ Convert the uploaded H5AD file to the CXG format servicing the cellxgene Explorer. """ @@ -75,7 +75,7 @@ def make_cxg(self, local_filename): cxg_output_container = local_filename.replace(".h5ad", ".cxg") try: h5ad_data_file = H5ADDataFile(local_filename, var_index_column_name="feature_name") - h5ad_data_file.to_cxg(cxg_output_container, sparse_threshold=25.0) + h5ad_data_file.to_cxg(cxg_output_container, sparse_threshold=25.0, dataset_version_id=dataset_version_id.id) except Exception as ex: # TODO use a specialized exception msg = "CXG conversion failed." diff --git a/backend/layers/processing/process_logic.py b/backend/layers/processing/process_logic.py index c85541510b6b4..338948bbf6681 100644 --- a/backend/layers/processing/process_logic.py +++ b/backend/layers/processing/process_logic.py @@ -114,7 +114,7 @@ def convert_file( start = datetime.now() try: self.update_processing_status(dataset_version_id, processing_status_key, DatasetConversionStatus.CONVERTING) - file_dir = converter(local_filename) + file_dir = converter(local_filename, dataset_version_id) self.update_processing_status(dataset_version_id, processing_status_key, DatasetConversionStatus.CONVERTED) self.logger.info(f"Finished converting {converter} in {datetime.now() - start}") except Exception: diff --git a/backend/layers/processing/process_seurat.py b/backend/layers/processing/process_seurat.py index 7a7b31a776a57..3156fdb8de652 100644 --- a/backend/layers/processing/process_seurat.py +++ b/backend/layers/processing/process_seurat.py @@ -96,7 +96,7 @@ def process(self, dataset_version_id: DatasetVersionId, artifact_bucket: str, da ) @logit - def make_seurat(self, local_filename): + def make_seurat(self, local_filename, *args, **kwargs): """ Create a Seurat rds file from the AnnData file. 
""" diff --git a/backend/layers/processing/utils/cxg_generation_utils.py b/backend/layers/processing/utils/cxg_generation_utils.py index 3d46f0cda13e8..78a104efa4f5c 100644 --- a/backend/layers/processing/utils/cxg_generation_utils.py +++ b/backend/layers/processing/utils/cxg_generation_utils.py @@ -34,7 +34,7 @@ def convert_dictionary_to_cxg_group(cxg_container, metadata_dict, group_metadata metadata_array.meta[key] = value -def convert_uns_to_cxg_group(cxg_container, metadata_dict, group_metadata_name="uns", ctx=None): +def convert_uns_to_cxg_group(cxg_container, metadata_dict, dataset_version_id, group_metadata_name="uns", ctx=None): """ Convert uns (unstructured) metadata to CXG output directory specified Generate deep zoom assets for spatial data @@ -53,7 +53,7 @@ def convert_uns_to_cxg_group(cxg_container, metadata_dict, group_metadata_name=" for object_id, content in value.items(): if object_id not in SPATIAL_KEYS_EXCLUDE: object_filtered = spatial_processor.filter_spatial_data(content, object_id) - spatial_processor.create_deep_zoom_assets(cxg_container, content) + spatial_processor.create_deep_zoom_assets(cxg_container, content, dataset_version_id) metadata_array.meta[key] = pickle.dumps(object_filtered) diff --git a/backend/layers/processing/utils/spatial.py b/backend/layers/processing/utils/spatial.py index 2acef734bb1bb..6e29c73db9aaa 100644 --- a/backend/layers/processing/utils/spatial.py +++ b/backend/layers/processing/utils/spatial.py @@ -116,23 +116,26 @@ def _generate_deep_zoom_assets(self, image_array, assets_folder): image = pyvips.Image.new_from_memory(linear.data, w, h, bands, "uchar") image.dzsave(os.path.join(assets_folder, "spatial"), suffix=".jpeg") - def _upload_assets(self, assets_folder): + def _upload_assets(self, assets_folder, dataset_version_id): """ Upload the deep zoom assets to the S3 bucket. Args: assets_folder (str): The folder containing the assets. + dataset_version_id (str): The UUID uniquely identifying the dataset version. """ - s3_uri = f"s3://{self.bucket_name}/{self.asset_directory}/{os.path.basename(assets_folder)}" + version_id = dataset_version_id.replace(".cxg", "") + s3_uri = f"s3://{self.bucket_name}/{self.asset_directory}/{version_id}" self.s3_provider.upload_directory(assets_folder, s3_uri) - def create_deep_zoom_assets(self, container_name, content): + def create_deep_zoom_assets(self, container_name, content, dataset_version_id): """ Create deep zoom assets for a container. Args: container_name (str): The name of the container. content (dict): The content dictionary containing the image array. + dataset_version_id (str): The UUID uniquely identifying the dataset version. 
""" try: with tempfile.TemporaryDirectory() as temp_dir: @@ -142,7 +145,7 @@ def create_deep_zoom_assets(self, container_name, content): image_array, _ = self._fetch_image(content) processed_image = self._process_and_flip_image(image_array) self._generate_deep_zoom_assets(processed_image, assets_folder) - self._upload_assets(assets_folder) + self._upload_assets(assets_folder, dataset_version_id) except Exception as e: logger.exception(f"Failed to create and upload deep zoom assets: {e}") raise diff --git a/tests/unit/processing/test_h5ad_data_file.py b/tests/unit/processing/test_h5ad_data_file.py index fdb1a4967fd40..7fbeb0e7bcceb 100644 --- a/tests/unit/processing/test_h5ad_data_file.py +++ b/tests/unit/processing/test_h5ad_data_file.py @@ -20,6 +20,7 @@ def setUp(self): self.sample_h5ad_filename = self._write_anndata_to_file(self.sample_anndata) self.sample_output_directory = path.splitext(self.sample_h5ad_filename)[0] + ".cxg" + self.dataset_version_id = "test_dataset_version_id" def tearDown(self): if self.sample_h5ad_filename: @@ -108,31 +109,31 @@ def test__create_h5ad_data_file__obs_and_var_index_names_specified_doesnt_exist_ def test__to_cxg__simple_anndata_no_corpora_and_sparse(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 100) + h5ad_file.to_cxg(self.sample_output_directory, 100, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, True) def test__to_cxg__simple_anndata_with_corpora_and_sparse(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 100) + h5ad_file.to_cxg(self.sample_output_directory, 100, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, True) def test__to_cxg__simple_anndata_no_corpora_and_dense(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) def test__to_cxg__simple_anndata_with_corpora_and_dense(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename) - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) def test__to_cxg__simple_anndata_with_corpora_and_dense_using_feature_name_var_index(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename, var_index_column_name="feature_name") - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match(self.sample_h5ad_filename, self.sample_output_directory, False) self._validate_cxg_var_index_column_match( @@ -142,7 +143,7 @@ def test__to_cxg__simple_anndata_with_corpora_and_dense_using_feature_name_var_i def test__to_cxg__simple_anndata_with_different_var_index_than_h5ad(self): h5ad_file = H5ADDataFile(self.sample_h5ad_filename, var_index_column_name="int_category") - h5ad_file.to_cxg(self.sample_output_directory, 0) + h5ad_file.to_cxg(self.sample_output_directory, 0, self.dataset_version_id) self._validate_cxg_var_index_column_match( self.sample_output_directory, @@ -155,7 +156,7 @@ def 
test__to_cxg__with_sparse_column_encoding(self): sparse_with_column_shift_filename = self._write_anndata_to_file(anndata) h5ad_file = H5ADDataFile(sparse_with_column_shift_filename) - h5ad_file.to_cxg(self.sample_output_directory, 50) + h5ad_file.to_cxg(self.sample_output_directory, 50, self.dataset_version_id) self._validate_cxg_and_h5ad_content_match( sparse_with_column_shift_filename, self.sample_output_directory, False, has_column_encoding=True