Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/pip/python_dependencies/processin…
Browse files Browse the repository at this point in the history
…g/rpy2-3.5.16
  • Loading branch information
nayib-jose-gloria authored May 14, 2024
2 parents 0da5b60 + 1729cda commit 4ac6d43
Show file tree
Hide file tree
Showing 18 changed files with 101 additions and 44 deletions.
1 change: 1 addition & 0 deletions Dockerfile.cellguide_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ADD backend/wmg/config.py backend/wmg/config.py
ADD backend/wmg/data backend/wmg/data
ADD backend/wmg/api backend/wmg/api
ADD backend/cellguide/pipeline backend/cellguide/pipeline
ADD backend/cellguide/common backend/cellguide/common
ADD backend/common backend/common

ARG HAPPY_BRANCH="unknown"
Expand Down
1 change: 1 addition & 0 deletions backend/cellguide/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
GPT_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_descriptions"
GPT_SEO_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_seo_descriptions"
MARKER_GENE_PRESENCE_FILENAME = "marker_gene_presence.json.gz"
MARKER_GENE_DATA_FILENAME = "marker_gene_data.json.gz"
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from backend.cellguide.common.providers.s3_provider import S3Provider
from backend.cellguide.common.utils import get_object_key

_marker_gene_data_cache = None


def _defaultdict_to_dict(d):
if isinstance(d, defaultdict):
Expand All @@ -33,6 +35,27 @@ def _initialize_cellguide_marker_gene_dict():
),
)
marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8"))

return format_marker_gene_data(marker_gene_data)


def format_marker_gene_data(marker_gene_data):
"""
Reformat the raw marker gene data into a structured dictionary.
The function transforms the raw marker gene data into a nested dictionary format organized by organism, tissue,
and cell type, where each cell type contains a list of marker genes with their respective scores and properties.
Parameters:
marker_gene_data (dict): The raw marker gene data loaded from JSON, expected to have a structure where each
gene is mapped to organisms, which in turn map to tissues, and then to a list of
marker details.
Returns:
dict: A nested dictionary with the structure {organism: {tissue: {cell_type_id: [marker details]}}}.
Each marker detail is a dictionary containing the gene, marker score, and other properties.
"""

data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for gene in marker_gene_data:
Expand All @@ -53,9 +76,6 @@ def _initialize_cellguide_marker_gene_dict():
return data


_marker_gene_data_cache = None


def get_marker_gene_data():
global _marker_gene_data_cache
if _marker_gene_data_cache is None:
Expand Down
24 changes: 14 additions & 10 deletions backend/cellguide/pipeline/computational_marker_genes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging

from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
from backend.cellguide.common.constants import (
COMPUTATIONAL_MARKER_GENES_FOLDERNAME,
MARKER_GENE_DATA_FILENAME,
MARKER_GENE_PRESENCE_FILENAME,
)
from backend.cellguide.common.data import format_marker_gene_data
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
MARKER_SCORE_THRESHOLD,
MarkerGenesCalculator,
Expand All @@ -17,7 +22,7 @@
def run(*, output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
marker_genes, reformatted_marker_genes = get_computational_marker_genes(
marker_genes, reformatted_marker_genes, formatted_marker_gene_data = get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=ontology_tree,
)
Expand All @@ -26,6 +31,10 @@ def run(*, output_directory: str):
reformatted_marker_genes,
f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}",
)
output_json(
formatted_marker_gene_data,
f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_DATA_FILENAME}",
)


def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: OntologyTreeBuilder) -> tuple[dict, dict]:
Expand Down Expand Up @@ -114,11 +123,6 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
)
reformatted_marker_genes[symbol][organism][tissue].append(data)

# # assert that cell types do not appear multiple times in each gene, tissue, organism
# for symbol in reformatted_marker_genes:
# for organism in reformatted_marker_genes[symbol]:
# for tissue in reformatted_marker_genes[symbol][organism]:
# cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
# assert len(cell_type_ids) == len(list(set(cell_type_ids)))

return marker_genes, reformatted_marker_genes
# reformat the data to be a nested dictionary with structure organism-->tissue-->celltype-->genes
organism_tissue_celltype_genes_data = format_marker_gene_data(reformatted_marker_genes)
return marker_genes, reformatted_marker_genes, organism_tissue_celltype_genes_data
11 changes: 5 additions & 6 deletions backend/layers/processing/process_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,11 @@ def get_spatial_metadata(self, spatial_dict: Dict[str, Any]) -> Optional[Spatial
"""
is_single = spatial_dict.get("is_single")
has_fullres = False
# schema validation ensures nested 'fullres' key is only included when is_single is True
if is_single:
# schema validation ensures there can only be one other key in uns["spatial"] if "is_single" is True
library_id = [key for key in spatial_dict if key != "is_single"][0]
if "fullres" in spatial_dict[library_id]["images"]:
has_fullres = True
spatial_library_ids = [key for key in spatial_dict if key != "is_single"]
# schema validation ensures there can be at most one other key in uns["spatial"] if "is_single" is True
library_id = spatial_library_ids.pop() if spatial_library_ids else None
if library_id and "images" in spatial_dict[library_id] and "fullres" in spatial_dict[library_id]["images"]:
has_fullres = True
return SpatialMetadata(is_single=bool(is_single), has_fullres=has_fullres)

@logit
Expand Down
20 changes: 5 additions & 15 deletions frontend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,8 @@ loginState.json
public/sitemap*
public/robots.txt

# Cell Guide
src/views/CellGuide/common/fixtures/allCellTypeDescriptions.json
src/views/CellGuide/common/fixtures/allCellTypeDescriptionsSEO.json
src/views/CellGuide/common/fixtures/allCellTypeMarkerGenes.json
src/views/CellGuide/common/fixtures/allCellTypeOwlDescriptions.json
src/views/CellGuide/common/fixtures/allCellTypes.json
src/views/CellGuide/common/fixtures/allEnrichedGenes/
src/views/CellGuide/common/fixtures/allSourceData.json
src/views/CellGuide/common/fixtures/allTissueDescriptions.json
src/views/CellGuide/common/fixtures/allTissues.json
src/views/CellGuide/common/fixtures/ontologyTree.json
src/views/CellGuide/common/fixtures/ontologyTreeStatePerCellType.json
src/views/CellGuide/common/fixtures/ontologyTreeStatePerTissue.json

certificates
# Filter descendants
src/components/common/Filter/descendant_mappings/cell_type_descendants.json
src/components/common/Filter/descendant_mappings/tissue_descendants.json

certificates
11 changes: 11 additions & 0 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,25 @@ RUN apt-get update && apt-get install -y make wget \
lsb-release xdg-utils nano vim procps moreutils ripgrep \
&& rm -rf /var/lib/apt/lists/*

# Create the directory `make retrieve-descendants` expects before running npm ci, which runs `make retrieve-descendants`
# as a preinstall script
RUN mkdir -p src/components/common/Filter/descendant_mappings

COPY Makefile ./
COPY package*.json ./
RUN npm ci --verbose --no-optional && npm cache clean --force

ENV PATH /opt/node_app/node_modules/.bin:$PATH

# -- TODO, we should try turning this back on later.
# ADD --chown=node . /corpora-frontend
ADD . /corpora-frontend

# Explicitly copy the JSON files from the earlier steps
RUN cp -r /opt/node_app/src/components/common/Filter/descendant_mappings/* /corpora-frontend/src/components/common/Filter/descendant_mappings/

WORKDIR /corpora-frontend

ADD ./src/configs/build.js src/configs/configs.js
RUN mkdir -p node_modules
RUN ln -sf /opt/node_app/node_modules/* /opt/node_app/node_modules/.bin ./node_modules/.
Expand Down
1 change: 1 addition & 0 deletions frontend/doc-site/032__Contribute and Publish Data.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ CELLxGENE is focused on supporting the global community attempting to create ref
- drug screens
- cell lines
- organisms other than mouse or human
- assays not on the [Census accepted assays list](https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/census_accepted_assays.csv)

### Scale Constraints

Expand Down
1 change: 1 addition & 0 deletions frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@
],
"license": "MIT",
"scripts": {
"predev": "make retrieve-descendants",
"prebuild": "make retrieve-descendants",
"preinstall": "make retrieve-descendants",
"dev": "next dev --experimental-https",
"build": "NODE_OPTIONS=\"--max_old_space_size=2048\" next build",
"start": "next start",
Expand Down

This file was deleted.

This file was deleted.

10 changes: 10 additions & 0 deletions frontend/src/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,13 @@ namespace HubSpotFormAPI {
}

declare const hbspt: HubSpotFormAPI.HubSpot;

// Ambient typing for the cell type descendants JSON mapping: every string key
// maps to an array of strings. NOTE(review): the JSON file itself is git-ignored
// and presumably produced by `make retrieve-descendants` — confirm.
declare module "src/components/common/Filter/descendant_mappings/cell_type_descendants.json" {
  const descendantsByKey: Record<string, string[]>;
  export default descendantsByKey;
}

// Ambient typing for the tissue descendants JSON mapping: every string key
// maps to an array of strings. NOTE(review): the JSON file itself is git-ignored
// and presumably produced by `make retrieve-descendants` — confirm.
declare module "src/components/common/Filter/descendant_mappings/tissue_descendants.json" {
  const descendantsByKey: Record<string, string[]>;
  export default descendantsByKey;
}
9 changes: 7 additions & 2 deletions scripts/generate_cellguide_pipeline_test_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
CELLTYPE_ONTOLOGY_TREE_STATE_FIXTURE_FILENAME,
CELLTYPE_TO_TISSUE_MAPPING_FILENAME,
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
ONTOLOGY_GRAPH_FIXTURE_FILENAME,
ONTOLOGY_TREE_TOPLEVEL_FOLDERNAME,
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
Expand Down Expand Up @@ -195,8 +196,8 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
"backend.cellguide.pipeline.computational_marker_genes.computational_markers.bootstrap_rows_percentiles",
new=mock_bootstrap_rows_percentiles,
):
computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
snapshot=snapshot, ontology_tree=ontology_tree
computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
get_computational_marker_genes(snapshot=snapshot, ontology_tree=ontology_tree)
)
output_json(
computational_marker_genes,
Expand All @@ -206,6 +207,10 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
reformatted_marker_genes,
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
)
output_json(
formatted_marker_genes,
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions tests/unit/backend/cellguide/pipeline/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
CANONICAL_MARKER_GENES_FIXTURE_FILENAME = "canonical_marker_genes.json"
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "computational_marker_genes.json"
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "reformatted_computational_marker_genes.json"
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "formatted_computational_marker_genes.json"
TISSUE_METADATA_FIXTURE_FILENAME = "tissue_metadata.json"
CELLTYPE_METADATA_FIXTURE_FILENAME = "cell_metadata.json"
ONTOLOGY_GRAPH_FIXTURE_FILENAME = "ontology_graph.json"
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from tests.unit.backend.cellguide.pipeline.constants import (
CELLGUIDE_PIPELINE_FIXTURES_BASEPATH,
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
)
from tests.unit.backend.wmg.fixtures.test_snapshot import (
Expand All @@ -28,6 +29,10 @@ def test__marker_gene_calculation(self):
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
) as f:
expected__reformatted_marker_genes = json.load(f)
with open(
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
) as f:
expected__formatted_marker_genes = json.load(f)
with (
load_realistic_test_snapshot(TEST_SNAPSHOT) as snapshot,
patch(
Expand All @@ -37,11 +42,15 @@ def test__marker_gene_calculation(self):
):
cell_counts_df = snapshot.cell_counts_cube.df[:]
tree_builder = OntologyTreeBuilder(cell_counts_df)
computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=tree_builder,
computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=tree_builder,
)
)
computational_marker_genes = convert_dataclass_to_dict_and_strip_nones(computational_marker_genes)
reformatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(reformatted_marker_genes)
formatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(formatted_marker_genes)
self.assertTrue(compare_dicts(computational_marker_genes, expected__computational_marker_genes))
self.assertTrue(compare_dicts(reformatted_marker_genes, expected__reformatted_marker_genes))
self.assertTrue(compare_dicts(formatted_marker_genes, expected__formatted_marker_genes))
8 changes: 7 additions & 1 deletion tests/unit/processing/test_extract_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def test_get_spatial_metadata__is_single_and_fullres_true(self):
}
self.assertEqual(self.pdv.get_spatial_metadata(spatial_dict), SpatialMetadata(is_single=True, has_fullres=True))

def test_get_spatial_metadata__is_single_true_fullres_false(self):
def test_get_spatial_metadata__is_single_true_and_fullres_false(self):
spatial_dict = {
"is_single": True,
"dummy_library_id": {"images": {}},
Expand All @@ -299,6 +299,12 @@ def test_get_spatial_metadata__is_single_true_fullres_false(self):
self.pdv.get_spatial_metadata(spatial_dict), SpatialMetadata(is_single=True, has_fullres=False)
)

def test_get_spatial_metadata__is_single_true_and_no_library_id(self):
    """Check the result when "is_single" is true and no library id key is present.

    The spatial dict holds only the "is_single" flag, so the expected
    metadata is is_single=True with has_fullres=False.
    """
    only_is_single = {"is_single": np.bool_(True)}
    expected_metadata = SpatialMetadata(is_single=True, has_fullres=False)
    actual_metadata = self.pdv.get_spatial_metadata(only_is_single)
    self.assertEqual(actual_metadata, expected_metadata)

def test_get_spatial_metadata__is_single_false(self):
spatial_dict = {"is_single": np.bool_(False)}
self.assertEqual(
Expand Down

0 comments on commit 4ac6d43

Please sign in to comment.