Merging staging branch into prod branch

chanzuckerberg · May 16, 2024 · 1f5fa6b · 1f5fa6b
2 parents 43ea7ab + d149bc5
commit 1f5fa6b
Show file tree

Hide file tree

Showing 15 changed files with 92 additions and 43 deletions.
diff --git a/.github/workflows/rebuild-processing-image.yml b/.github/workflows/rebuild-processing-image.yml
@@ -49,6 +49,8 @@ jobs:
         shell: bash
         run: |
           export BRANCH_SHA=$(git rev-parse --short=8 HEAD)
+          # Cache busting: a fresh CACHEBUST value on every run invalidates the Docker layers after `ARG CACHEBUST`, forcing dependencies to be reinstalled.
+          export CACHEBUST=$(date +%s)
           happy push "" --aws-profile "" --tag sha-${BRANCH_SHA} --slice processing
       - name: Alert in Slack
         uses: 8398a7/action-slack@v3

diff --git a/Dockerfile.cellguide_pipeline b/Dockerfile.cellguide_pipeline
@@ -21,6 +21,7 @@ ADD backend/wmg/config.py backend/wmg/config.py
 ADD backend/wmg/data backend/wmg/data
 ADD backend/wmg/api backend/wmg/api
 ADD backend/cellguide/pipeline backend/cellguide/pipeline
+ADD backend/cellguide/common backend/cellguide/common
 ADD backend/common backend/common
 
 ARG HAPPY_BRANCH="unknown"

diff --git a/Dockerfile.processing b/Dockerfile.processing
@@ -22,7 +22,10 @@ ENV PATH="/opt/venv/bin:$PATH"
 COPY /python_dependencies/processing/ .
 COPY /python_dependencies/common/ .
 ARG INSTALL_DEV=false
+
+ARG CACHEBUST=1
 RUN python3.10 -m pip install -r requirements.txt
+
 RUN if [ "$INSTALL_DEV" = "true" ]; then python3 -m pip install -r requirements-dev.txt; fi
 
 ADD backend/__init__.py backend/__init__.py

diff --git a/backend/cellguide/common/constants.py b/backend/cellguide/common/constants.py
@@ -12,3 +12,4 @@
 GPT_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_descriptions"
 GPT_SEO_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_seo_descriptions"
 MARKER_GENE_PRESENCE_FILENAME = "marker_gene_presence.json.gz"
+MARKER_GENE_DATA_FILENAME = "marker_gene_data.json.gz"
diff --git a/backend/cellguide/api/common/data.py → backend/cellguide/common/data.py b/backend/cellguide/api/common/data.py → backend/cellguide/common/data.py
@@ -8,6 +8,8 @@
 from backend.cellguide.common.providers.s3_provider import S3Provider
 from backend.cellguide.common.utils import get_object_key
 
+_marker_gene_data_cache = None
+
 
 def _defaultdict_to_dict(d):
     if isinstance(d, defaultdict):
@@ -33,6 +35,27 @@ def _initialize_cellguide_marker_gene_dict():
         ),
     )
     marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8"))
+
+    return format_marker_gene_data(marker_gene_data)
+
+
+def format_marker_gene_data(marker_gene_data):
+    """
+    Reformat the raw marker gene data into a structured dictionary.
+
+    The function transforms the raw marker gene data into a nested dictionary format organized by organism, tissue,
+    and cell type, where each cell type contains a list of marker genes with their respective scores and properties.
+
+    Parameters:
+        marker_gene_data (dict): The raw marker gene data loaded from JSON, expected to have a structure where each
+                                 gene is mapped to organisms, which in turn map to tissues, and then to a list of
+                                 marker details.
+
+    Returns:
+        dict: A nested dictionary with the structure {organism: {tissue: {cell_type_id: [marker details]}}}.
+              Each marker detail is a dictionary containing the gene, marker score, and other properties.
+    """
+
     data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
 
     for gene in marker_gene_data:
@@ -53,9 +76,6 @@ def _initialize_cellguide_marker_gene_dict():
     return data
 
 
-_marker_gene_data_cache = None
-
-
 def get_marker_gene_data():
     global _marker_gene_data_cache
     if _marker_gene_data_cache is None:

diff --git a/backend/cellguide/pipeline/computational_marker_genes/__init__.py b/backend/cellguide/pipeline/computational_marker_genes/__init__.py
@@ -1,6 +1,11 @@
 import logging
 
-from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
+from backend.cellguide.common.constants import (
+    COMPUTATIONAL_MARKER_GENES_FOLDERNAME,
+    MARKER_GENE_DATA_FILENAME,
+    MARKER_GENE_PRESENCE_FILENAME,
+)
+from backend.cellguide.common.data import format_marker_gene_data
 from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
     MARKER_SCORE_THRESHOLD,
     MarkerGenesCalculator,
@@ -17,7 +22,7 @@
 def run(*, output_directory: str):
     snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
     ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
-    marker_genes, reformatted_marker_genes = get_computational_marker_genes(
+    marker_genes, reformatted_marker_genes, formatted_marker_gene_data = get_computational_marker_genes(
         snapshot=snapshot,
         ontology_tree=ontology_tree,
     )
@@ -26,6 +31,10 @@ def run(*, output_directory: str):
         reformatted_marker_genes,
         f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}",
     )
+    output_json(
+        formatted_marker_gene_data,
+        f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_DATA_FILENAME}",
+    )
 
 
 def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: OntologyTreeBuilder) -> tuple[dict, dict]:
@@ -114,11 +123,6 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
             )
             reformatted_marker_genes[symbol][organism][tissue].append(data)
 
-    # # assert that cell types do not appear multiple times in each gene, tissue, organism
-    # for symbol in reformatted_marker_genes:
-    #     for organism in reformatted_marker_genes[symbol]:
-    #         for tissue in reformatted_marker_genes[symbol][organism]:
-    #             cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
-    #             assert len(cell_type_ids) == len(list(set(cell_type_ids)))
-
-    return marker_genes, reformatted_marker_genes
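+    # Also build the nested {organism: {tissue: {cell_type_id: [marker details]}}} view of the same data.
+    # NOTE(review): three values are returned below; the `-> tuple[dict, dict]` annotation on the signature looks stale.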
+    organism_tissue_celltype_genes_data = format_marker_gene_data(reformatted_marker_genes)
+    return marker_genes, reformatted_marker_genes, organism_tissue_celltype_genes_data
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -196,6 +196,7 @@ services:
         - HAPPY_BRANCH=$HAPPY_BRANCH
         - INSTALL_DEV=$INSTALL_DEV
         - HAPPY_TAG
+        - CACHEBUST=$CACHEBUST
     restart: "no"
     volumes:
       - ./backend/portal/pipeline/processing:/backend/portal/pipeline/processing

diff --git a/frontend/doc-site/032__Contribute and Publish Data.mdx b/frontend/doc-site/032__Contribute and Publish Data.mdx
@@ -22,6 +22,7 @@ CELLxGENE is focused on supporting the global community attempting to create ref
 - drug screens
 - cell lines
 - organisms other than mouse or human
+- assays not on the [Census accepted assays list](https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/census_accepted_assays.csv)
 
 ### Scale Constraints
 

diff --git a/frontend/package-lock.json b/frontend/package-lock.json
diff --git a/frontend/package.json b/frontend/package.json
@@ -75,7 +75,7 @@
     "@types/pako": "^2.0.0",
     "@types/papaparse": "^5.3.7",
     "@types/pixelmatch": "^5.2.4",
-    "@types/pngjs": "^6.0.1",
+    "@types/pngjs": "^6.0.5",
     "@types/react": "18.2.42",
     "@types/react-dom": "^18.2.18",
     "@types/react-highlight": "^0.12.8",
@@ -102,7 +102,7 @@
     "pixelmatch": "^5.3.0",
     "prettier": "^3.0.2",
     "prettier-plugin-organize-imports": "^2.3.4",
-    "sharp": "^0.32.2",
+    "sharp": "^0.32.6",
     "start-server-and-test": "^1.15.4",
     "stylelint": "^13.13.1",
     "stylelint-config-recommended": "^3.0.0",

diff --git a/python_dependencies/processing/requirements.txt b/python_dependencies/processing/requirements.txt
@@ -20,7 +20,7 @@ pydantic>=1.9.0
 PyMySQL==0.9.3
 python-json-logger
 requests>=2.22.0
-rpy2==3.5.14
+rpy2==3.5.16
 rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability
 s3fs==0.4.2
 scanpy==1.9.3

diff --git a/scripts/generate_cellguide_pipeline_test_fixtures.py b/scripts/generate_cellguide_pipeline_test_fixtures.py
@@ -33,6 +33,7 @@
     CELLTYPE_ONTOLOGY_TREE_STATE_FIXTURE_FILENAME,
     CELLTYPE_TO_TISSUE_MAPPING_FILENAME,
     COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
+    FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
     ONTOLOGY_GRAPH_FIXTURE_FILENAME,
     ONTOLOGY_TREE_TOPLEVEL_FOLDERNAME,
     REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
@@ -195,8 +196,8 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
                 "backend.cellguide.pipeline.computational_marker_genes.computational_markers.bootstrap_rows_percentiles",
                 new=mock_bootstrap_rows_percentiles,
             ):
-                computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
-                    snapshot=snapshot, ontology_tree=ontology_tree
+                computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
+                    get_computational_marker_genes(snapshot=snapshot, ontology_tree=ontology_tree)
                 )
                 output_json(
                     computational_marker_genes,
@@ -206,6 +207,10 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
                     reformatted_marker_genes,
                     f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
                 )
+                output_json(
+                    formatted_marker_genes,
+                    f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
+                )
 
 
 if __name__ == "__main__":

diff --git a/tests/unit/backend/cellguide/pipeline/constants.py b/tests/unit/backend/cellguide/pipeline/constants.py
@@ -3,6 +3,7 @@
 CANONICAL_MARKER_GENES_FIXTURE_FILENAME = "canonical_marker_genes.json"
 COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "computational_marker_genes.json"
 REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "reformatted_computational_marker_genes.json"
+FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "formatted_computational_marker_genes.json"
 TISSUE_METADATA_FIXTURE_FILENAME = "tissue_metadata.json"
 CELLTYPE_METADATA_FIXTURE_FILENAME = "cell_metadata.json"
 ONTOLOGY_GRAPH_FIXTURE_FILENAME = "ontology_graph.json"

diff --git a/tests/unit/backend/cellguide/pipeline/fixtures/formatted_computational_marker_genes.json b/tests/unit/backend/cellguide/pipeline/fixtures/formatted_computational_marker_genes.json
diff --git a/tests/unit/backend/cellguide/pipeline/test_computational_marker_genes.py b/tests/unit/backend/cellguide/pipeline/test_computational_marker_genes.py
@@ -10,6 +10,7 @@
 from tests.unit.backend.cellguide.pipeline.constants import (
     CELLGUIDE_PIPELINE_FIXTURES_BASEPATH,
     COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
+    FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
     REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
 )
 from tests.unit.backend.wmg.fixtures.test_snapshot import (
@@ -28,6 +29,10 @@ def test__marker_gene_calculation(self):
             f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
         ) as f:
             expected__reformatted_marker_genes = json.load(f)
+        with open(
+            f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
+        ) as f:
+            expected__formatted_marker_genes = json.load(f)
         with (
             load_realistic_test_snapshot(TEST_SNAPSHOT) as snapshot,
             patch(
@@ -37,11 +42,15 @@ def test__marker_gene_calculation(self):
         ):
             cell_counts_df = snapshot.cell_counts_cube.df[:]
             tree_builder = OntologyTreeBuilder(cell_counts_df)
-            computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
-                snapshot=snapshot,
-                ontology_tree=tree_builder,
+            computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
+                get_computational_marker_genes(
+                    snapshot=snapshot,
+                    ontology_tree=tree_builder,
+                )
             )
             computational_marker_genes = convert_dataclass_to_dict_and_strip_nones(computational_marker_genes)
             reformatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(reformatted_marker_genes)
+            formatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(formatted_marker_genes)
             self.assertTrue(compare_dicts(computational_marker_genes, expected__computational_marker_genes))
             self.assertTrue(compare_dicts(reformatted_marker_genes, expected__reformatted_marker_genes))
+            self.assertTrue(compare_dicts(formatted_marker_genes, expected__formatted_marker_genes))