Skip to content

Commit

Permalink
Merging staging branch into prod branch
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed May 16, 2024
2 parents 43ea7ab + d149bc5 commit 1f5fa6b
Show file tree
Hide file tree
Showing 15 changed files with 92 additions and 43 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rebuild-processing-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ jobs:
shell: bash
run: |
export BRANCH_SHA=$(git rev-parse --short=8 HEAD)
# Cache busting: pass a fresh CACHEBUST build arg so Docker does not reuse the cached dependency-install layers and the image is always rebuilt.
export CACHEBUST=$(date +%s)
happy push "" --aws-profile "" --tag sha-${BRANCH_SHA} --slice processing
- name: Alert in Slack
uses: 8398a7/action-slack@v3
Expand Down
1 change: 1 addition & 0 deletions Dockerfile.cellguide_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ADD backend/wmg/config.py backend/wmg/config.py
ADD backend/wmg/data backend/wmg/data
ADD backend/wmg/api backend/wmg/api
ADD backend/cellguide/pipeline backend/cellguide/pipeline
ADD backend/cellguide/common backend/cellguide/common
ADD backend/common backend/common

ARG HAPPY_BRANCH="unknown"
Expand Down
3 changes: 3 additions & 0 deletions Dockerfile.processing
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ ENV PATH="/opt/venv/bin:$PATH"
COPY /python_dependencies/processing/ .
COPY /python_dependencies/common/ .
ARG INSTALL_DEV=false

ARG CACHEBUST=1
RUN python3.10 -m pip install -r requirements.txt

RUN if [ "$INSTALL_DEV" = "true" ]; then python3 -m pip install -r requirements-dev.txt; fi

ADD backend/__init__.py backend/__init__.py
Expand Down
1 change: 1 addition & 0 deletions backend/cellguide/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
GPT_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_descriptions"
GPT_SEO_OUTPUT_DIRECTORY_FOLDERNAME = "gpt_seo_descriptions"
MARKER_GENE_PRESENCE_FILENAME = "marker_gene_presence.json.gz"
MARKER_GENE_DATA_FILENAME = "marker_gene_data.json.gz"
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from backend.cellguide.common.providers.s3_provider import S3Provider
from backend.cellguide.common.utils import get_object_key

_marker_gene_data_cache = None


def _defaultdict_to_dict(d):
if isinstance(d, defaultdict):
Expand All @@ -33,6 +35,27 @@ def _initialize_cellguide_marker_gene_dict():
),
)
marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8"))

return format_marker_gene_data(marker_gene_data)


def format_marker_gene_data(marker_gene_data):
"""
Reformat the raw marker gene data into a structured dictionary.
The function transforms the raw marker gene data into a nested dictionary format organized by organism, tissue,
and cell type, where each cell type contains a list of marker genes with their respective scores and properties.
Parameters:
marker_gene_data (dict): The raw marker gene data loaded from JSON, expected to have a structure where each
gene is mapped to organisms, which in turn map to tissues, and then to a list of
marker details.
Returns:
dict: A nested dictionary with the structure {organism: {tissue: {cell_type_id: [marker details]}}}.
Each marker detail is a dictionary containing the gene, marker score, and other properties.
"""

data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for gene in marker_gene_data:
Expand All @@ -53,9 +76,6 @@ def _initialize_cellguide_marker_gene_dict():
return data


_marker_gene_data_cache = None


def get_marker_gene_data():
global _marker_gene_data_cache
if _marker_gene_data_cache is None:
Expand Down
24 changes: 14 additions & 10 deletions backend/cellguide/pipeline/computational_marker_genes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import logging

from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
from backend.cellguide.common.constants import (
COMPUTATIONAL_MARKER_GENES_FOLDERNAME,
MARKER_GENE_DATA_FILENAME,
MARKER_GENE_PRESENCE_FILENAME,
)
from backend.cellguide.common.data import format_marker_gene_data
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
MARKER_SCORE_THRESHOLD,
MarkerGenesCalculator,
Expand All @@ -17,7 +22,7 @@
def run(*, output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
marker_genes, reformatted_marker_genes = get_computational_marker_genes(
marker_genes, reformatted_marker_genes, formatted_marker_gene_data = get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=ontology_tree,
)
Expand All @@ -26,6 +31,10 @@ def run(*, output_directory: str):
reformatted_marker_genes,
f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}",
)
output_json(
formatted_marker_gene_data,
f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_DATA_FILENAME}",
)


def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: OntologyTreeBuilder) -> tuple[dict, dict]:
Expand Down Expand Up @@ -114,11 +123,6 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
)
reformatted_marker_genes[symbol][organism][tissue].append(data)

# # assert that cell types do not appear multiple times in each gene, tissue, organism
# for symbol in reformatted_marker_genes:
# for organism in reformatted_marker_genes[symbol]:
# for tissue in reformatted_marker_genes[symbol][organism]:
# cell_type_ids = [i["cell_type_id"] for i in reformatted_marker_genes[symbol][organism][tissue]]
# assert len(cell_type_ids) == len(list(set(cell_type_ids)))

return marker_genes, reformatted_marker_genes
# reformat the data to be a nested dictionary with structure organism-->tissue-->celltype-->genes
organism_tissue_celltype_genes_data = format_marker_gene_data(reformatted_marker_genes)
return marker_genes, reformatted_marker_genes, organism_tissue_celltype_genes_data
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ services:
- HAPPY_BRANCH=$HAPPY_BRANCH
- INSTALL_DEV=$INSTALL_DEV
- HAPPY_TAG
- CACHEBUST=$CACHEBUST
restart: "no"
volumes:
- ./backend/portal/pipeline/processing:/backend/portal/pipeline/processing
Expand Down
1 change: 1 addition & 0 deletions frontend/doc-site/032__Contribute and Publish Data.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ CELLxGENE is focused on supporting the global community attempting to create ref
- drug screens
- cell lines
- organisms other than mouse or human
- assays not on the [Census accepted assays list](https://github.com/chanzuckerberg/cellxgene-census/blob/main/docs/census_accepted_assays.csv)

### Scale Constraints

Expand Down
44 changes: 22 additions & 22 deletions frontend/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions frontend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"@types/pako": "^2.0.0",
"@types/papaparse": "^5.3.7",
"@types/pixelmatch": "^5.2.4",
"@types/pngjs": "^6.0.1",
"@types/pngjs": "^6.0.5",
"@types/react": "18.2.42",
"@types/react-dom": "^18.2.18",
"@types/react-highlight": "^0.12.8",
Expand All @@ -102,7 +102,7 @@
"pixelmatch": "^5.3.0",
"prettier": "^3.0.2",
"prettier-plugin-organize-imports": "^2.3.4",
"sharp": "^0.32.2",
"sharp": "^0.32.6",
"start-server-and-test": "^1.15.4",
"stylelint": "^13.13.1",
"stylelint-config-recommended": "^3.0.0",
Expand Down
2 changes: 1 addition & 1 deletion python_dependencies/processing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pydantic>=1.9.0
PyMySQL==0.9.3
python-json-logger
requests>=2.22.0
rpy2==3.5.14
rpy2==3.5.16
rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability
s3fs==0.4.2
scanpy==1.9.3
Expand Down
9 changes: 7 additions & 2 deletions scripts/generate_cellguide_pipeline_test_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
CELLTYPE_ONTOLOGY_TREE_STATE_FIXTURE_FILENAME,
CELLTYPE_TO_TISSUE_MAPPING_FILENAME,
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
ONTOLOGY_GRAPH_FIXTURE_FILENAME,
ONTOLOGY_TREE_TOPLEVEL_FOLDERNAME,
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
Expand Down Expand Up @@ -195,8 +196,8 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
"backend.cellguide.pipeline.computational_marker_genes.computational_markers.bootstrap_rows_percentiles",
new=mock_bootstrap_rows_percentiles,
):
computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
snapshot=snapshot, ontology_tree=ontology_tree
computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
get_computational_marker_genes(snapshot=snapshot, ontology_tree=ontology_tree)
)
output_json(
computational_marker_genes,
Expand All @@ -206,6 +207,10 @@ def run_cellguide_pipeline(fixture_type: FixtureType):
reformatted_marker_genes,
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
)
output_json(
formatted_marker_genes,
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}",
)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions tests/unit/backend/cellguide/pipeline/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
CANONICAL_MARKER_GENES_FIXTURE_FILENAME = "canonical_marker_genes.json"
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "computational_marker_genes.json"
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "reformatted_computational_marker_genes.json"
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME = "formatted_computational_marker_genes.json"
TISSUE_METADATA_FIXTURE_FILENAME = "tissue_metadata.json"
CELLTYPE_METADATA_FIXTURE_FILENAME = "cell_metadata.json"
ONTOLOGY_GRAPH_FIXTURE_FILENAME = "ontology_graph.json"
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from tests.unit.backend.cellguide.pipeline.constants import (
CELLGUIDE_PIPELINE_FIXTURES_BASEPATH,
COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME,
)
from tests.unit.backend.wmg.fixtures.test_snapshot import (
Expand All @@ -28,6 +29,10 @@ def test__marker_gene_calculation(self):
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{REFORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
) as f:
expected__reformatted_marker_genes = json.load(f)
with open(
f"{CELLGUIDE_PIPELINE_FIXTURES_BASEPATH}/{FORMATTED_COMPUTATIONAL_MARKER_GENES_FIXTURE_FILENAME}", "r"
) as f:
expected__formatted_marker_genes = json.load(f)
with (
load_realistic_test_snapshot(TEST_SNAPSHOT) as snapshot,
patch(
Expand All @@ -37,11 +42,15 @@ def test__marker_gene_calculation(self):
):
cell_counts_df = snapshot.cell_counts_cube.df[:]
tree_builder = OntologyTreeBuilder(cell_counts_df)
computational_marker_genes, reformatted_marker_genes = get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=tree_builder,
computational_marker_genes, reformatted_marker_genes, formatted_marker_genes = (
get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=tree_builder,
)
)
computational_marker_genes = convert_dataclass_to_dict_and_strip_nones(computational_marker_genes)
reformatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(reformatted_marker_genes)
formatted_marker_genes = convert_dataclass_to_dict_and_strip_nones(formatted_marker_genes)
self.assertTrue(compare_dicts(computational_marker_genes, expected__computational_marker_genes))
self.assertTrue(compare_dicts(reformatted_marker_genes, expected__reformatted_marker_genes))
self.assertTrue(compare_dicts(formatted_marker_genes, expected__formatted_marker_genes))

0 comments on commit 1f5fa6b

Please sign in to comment.