Skip to content

Commit

Permalink
MERGE COMMIT
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed May 29, 2024
2 parents f4eef42 + e00ff78 commit 11a520f
Show file tree
Hide file tree
Showing 100 changed files with 878 additions and 1,998 deletions.
3 changes: 0 additions & 3 deletions Dockerfile.cellguide_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@ RUN pip3 install -r requirements.txt
RUN if [ "$INSTALL_DEV" = "true" ]; then python3 -m pip install -r requirements-dev.txt; fi

ADD backend/__init__.py backend/__init__.py
ADD backend/wmg/config.py backend/wmg/config.py
ADD backend/wmg/data backend/wmg/data
ADD backend/wmg/api backend/wmg/api
ADD backend/cellguide/pipeline backend/cellguide/pipeline
ADD backend/cellguide/common backend/cellguide/common
ADD backend/common backend/common
Expand Down
6 changes: 0 additions & 6 deletions Dockerfile.wmg_pipeline
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,7 @@ RUN pip3 install -r requirements.txt
RUN if [ "$INSTALL_DEV" = "true" ]; then python3 -m pip install -r requirements-dev.txt; fi

ADD backend/__init__.py backend/__init__.py
ADD backend/wmg/__init__.py backend/wmg/__init__.py
ADD backend/wmg/config.py backend/wmg/config.py
ADD backend/wmg/data backend/wmg/data
ADD backend/wmg/pipeline backend/wmg/pipeline
ADD backend/wmg/api backend/wmg/api
ADD backend/cellguide backend/cellguide
ADD backend/layers backend/layers
ADD backend/common backend/common

ARG HAPPY_BRANCH="unknown"
Expand Down
1 change: 0 additions & 1 deletion backend/api_server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def add_api(base_path, spec_file):
add_api(base_path="/cellguide", spec_file="cellguide/api/cellguide-api.yml")
curation_api = add_api(base_path="/curation", spec_file="curation/api/curation-api.yml")
curation_api.blueprint.json_encoder = CurationJSONEncoder
add_api(base_path="/wmg", spec_file="wmg/api/wmg-api.yml")
add_api(base_path="/wmg/v2", spec_file="wmg/api/wmg-api-v2.yml")
add_api(base_path="/de", spec_file="de/api/de-api.yml")
add_api(base_path="/gene_info", spec_file="gene_info/api/gene-info-api.yml")
Expand Down
23 changes: 11 additions & 12 deletions backend/cellguide/pipeline/canonical_marker_genes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@

from backend.cellguide.common.constants import CANONICAL_MARKER_GENES_FOLDERNAME
from backend.cellguide.pipeline.canonical_marker_genes.canonical_markers import CanonicalMarkerGenesCompiler
from backend.cellguide.pipeline.constants import HOMO_SAPIENS_ORGANISM_ONTOLOGY_TERM_ID
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
from backend.cellguide.pipeline.constants import (
CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION,
HOMO_SAPIENS_ORGANISM_ONTOLOGY_TERM_ID,
)
from backend.cellguide.pipeline.utils import output_json_per_key
from backend.wmg.api.wmg_api_config import WMG_API_SNAPSHOT_SCHEMA_VERSION
from backend.wmg.data.snapshot import WmgSnapshot, load_snapshot
from backend.common.census_cube.data import snapshot as sn
from backend.common.census_cube.utils import get_all_cell_type_ids_in_corpus

logger = logging.getLogger(__name__)


def run(*, output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
data = get_canonical_marker_genes(snapshot=snapshot, ontology_tree=ontology_tree)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)
data = get_canonical_marker_genes(snapshot=snapshot)
output_json_per_key(data, f"{output_directory}/{CANONICAL_MARKER_GENES_FOLDERNAME}")


def get_canonical_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: OntologyTreeBuilder) -> dict:
def get_canonical_marker_genes(*, snapshot: sn.CensusCubeSnapshot) -> dict:
wmg_tissues = [
next(iter(i.keys()))
for i in snapshot.primary_filter_dimensions["tissue_terms"][HOMO_SAPIENS_ORGANISM_ONTOLOGY_TERM_ID]
Expand All @@ -31,9 +31,8 @@ def get_canonical_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Ontology
marker_gene_compiler = CanonicalMarkerGenesCompiler(wmg_tissues=wmg_tissues, wmg_human_genes=wmg_human_genes)
parsed_asctb_table_entries = marker_gene_compiler.get_processed_asctb_table_entries()

num_cell_types_in_corpus = len(
set(parsed_asctb_table_entries).intersection(ontology_tree.all_cell_type_ids_in_corpus)
)
all_cell_type_ids_in_corpus = get_all_cell_type_ids_in_corpus(snapshot)
num_cell_types_in_corpus = len(set(parsed_asctb_table_entries).intersection(all_cell_type_ids_in_corpus))
logger.info(
f"Parsed {len(parsed_asctb_table_entries)} cell types in ASCTB, out of which {num_cell_types_in_corpus} are in CellGuide."
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
get_title_and_citation_from_doi,
)
from backend.cellguide.pipeline.constants import ASCTB_MASTER_SHEET_URL, CELLGUIDE_PIPELINE_NUM_CPUS
from backend.cellguide.pipeline.utils import get_gene_id_to_name_and_symbol
from backend.wmg.data.ontology_labels import ontology_term_label
from backend.wmg.data.utils import setup_retry_session
from backend.common.census_cube.data.ontology_labels import ontology_term_label
from backend.common.census_cube.utils import setup_retry_session
from backend.common.marker_genes.marker_gene_files.gene_metadata import get_gene_id_to_name_and_symbol

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion backend/cellguide/pipeline/canonical_marker_genes/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from requests import Response

from backend.wmg.data.utils import setup_retry_session
from backend.common.census_cube.utils import setup_retry_session


def clean_doi(doi: str) -> str:
Expand Down
22 changes: 7 additions & 15 deletions backend/cellguide/pipeline/computational_marker_genes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,20 @@
MARKER_GENE_PRESENCE_FILENAME,
)
from backend.cellguide.common.data import format_marker_gene_data
from backend.cellguide.pipeline.computational_marker_genes.computational_markers import (
MARKER_SCORE_THRESHOLD,
MarkerGenesCalculator,
)
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
from backend.cellguide.pipeline.constants import CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION
from backend.cellguide.pipeline.utils import output_json, output_json_per_key
from backend.wmg.api.wmg_api_config import WMG_API_SNAPSHOT_SCHEMA_VERSION
from backend.wmg.data.snapshot import WmgSnapshot, load_snapshot
from backend.common.census_cube.data import snapshot as sn
from backend.common.marker_genes.computational_markers import MarkerGenesCalculator
from backend.common.marker_genes.constants import MARKER_SCORE_THRESHOLD

logger = logging.getLogger(__name__)


def run(*, output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)

marker_genes, reformatted_marker_genes, formatted_marker_gene_data = get_computational_marker_genes(
snapshot=snapshot,
ontology_tree=ontology_tree,
)
output_json_per_key(marker_genes, f"{output_directory}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}")
output_json(
Expand All @@ -37,7 +32,7 @@ def run(*, output_directory: str):
)


def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: OntologyTreeBuilder) -> tuple[dict, dict]:
def get_computational_marker_genes(*, snapshot: sn.CensusCubeSnapshot) -> tuple[dict, dict]:
"""
This function calculates the marker genes per tissue and across tissues.
Expand All @@ -50,18 +45,15 @@ def get_computational_marker_genes(*, snapshot: WmgSnapshot, ontology_tree: Onto
-------
dict - A dictionary containing the marker genes per tissue and across tissues keyed by cell type ontology term ID.
"""
all_cell_types_in_corpus = ontology_tree.all_cell_type_ids_in_corpus

calculator = MarkerGenesCalculator(
snapshot=snapshot,
all_cell_type_ids_in_corpus=all_cell_types_in_corpus,
groupby_terms=["organism_ontology_term_id", "cell_type_ontology_term_id"],
)
marker_genes = calculator.get_computational_marker_genes()

calculator = MarkerGenesCalculator(
snapshot=snapshot,
all_cell_type_ids_in_corpus=all_cell_types_in_corpus,
groupby_terms=["organism_ontology_term_id", "tissue_ontology_term_id", "cell_type_ontology_term_id"],
)
marker_genes_per_tissue = calculator.get_computational_marker_genes()
Expand Down
4 changes: 1 addition & 3 deletions backend/cellguide/pipeline/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

ASCTB_MASTER_SHEET_URL = "https://ccf-ontology.hubmapconsortium.org/v2.3.0/ccf-asctb-all.json"

ENSEMBL_GENE_ID_TO_DESCRIPTION_FILENAME = "ensembl_gene_ids_to_descriptions.tsv.gz"

HOMO_SAPIENS_ORGANISM_ONTOLOGY_TERM_ID = "NCBITaxon:9606"

# If CELLGUIDE_PIPELINE_NUM_CPUS is not set, use 12 CPUs by default
Expand All @@ -15,4 +13,4 @@

CELL_GUIDE_DATA_BUCKET_PATH_PREFIX = "s3://cellguide-data-public-"

CELL_GUIDE_PINNED_SCHEMA_VERSION = "5.0.0"
CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION = "v5"
6 changes: 3 additions & 3 deletions backend/cellguide/pipeline/gpt_descriptions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import logging

from backend.cellguide.pipeline.constants import CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION
from backend.cellguide.pipeline.gpt_descriptions.gpt_description_generator import (
generate_new_gpt_descriptions,
generate_new_seo_gpt_descriptions,
)
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
from backend.cellguide.pipeline.utils import output_json_per_key
from backend.wmg.api.wmg_api_config import WMG_API_SNAPSHOT_SCHEMA_VERSION
from backend.wmg.data.snapshot import load_snapshot
from backend.common.census_cube.data import snapshot as sn

logging.basicConfig(level=logging.INFO)


def run(*, gpt_output_directory: str, gpt_seo_output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
new_gpt_descriptions, new_gpt_seo_descriptions = get_new_gpt_descriptions(ontology_tree=ontology_tree)
output_json_per_key(new_gpt_descriptions, gpt_output_directory)
Expand Down
25 changes: 13 additions & 12 deletions backend/cellguide/pipeline/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,24 @@
from backend.cellguide.common.constants import CELL_GUIDE_METADATA_FILENAME, CELL_GUIDE_TISSUE_METADATA_FILENAME
from backend.cellguide.pipeline.constants import CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION
from backend.cellguide.pipeline.metadata.metadata_generator import (
generate_cellguide_card_metadata,
generate_cellguide_tissue_card_metadata,
)
from backend.cellguide.pipeline.metadata.types import CellMetadata, TissueMetadata
from backend.cellguide.pipeline.ontology_tree import get_ontology_tree_builder
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
from backend.cellguide.pipeline.utils import output_json
from backend.wmg.api.wmg_api_config import WMG_API_SNAPSHOT_SCHEMA_VERSION
from backend.wmg.data.snapshot import load_snapshot
from backend.common.census_cube.data import snapshot as sn
from backend.common.census_cube.utils import get_all_cell_type_ids_in_corpus, get_all_tissue_ids_in_corpus


def run(*, output_directory: str):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
ontology_tree = get_ontology_tree_builder(snapshot=snapshot)
cell_metadata = get_cell_metadata(ontology_tree=ontology_tree)
tissue_metadata = get_tissue_metadata(ontology_tree=ontology_tree)
cell_metadata = get_cell_metadata()
tissue_metadata = get_tissue_metadata()

output_json(cell_metadata, f"{output_directory}/{CELL_GUIDE_METADATA_FILENAME}")
output_json(tissue_metadata, f"{output_directory}/{CELL_GUIDE_TISSUE_METADATA_FILENAME}")


def get_cell_metadata(*, ontology_tree: OntologyTreeBuilder) -> dict[str, CellMetadata]:
def get_cell_metadata() -> dict[str, CellMetadata]:
"""
For all cell type ids in the corpus, this pipeline will generate metadata about each cell, including:
- name, ex: "native cell"
Expand All @@ -32,10 +29,12 @@ def get_cell_metadata(*, ontology_tree: OntologyTreeBuilder) -> dict[str, CellMe
Note that we will be filtering out obsolete cell types and invalid non-CL cell types.
"""
return generate_cellguide_card_metadata(ontology_tree.all_cell_type_ids_in_corpus)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)
all_cell_type_ids_in_corpus = get_all_cell_type_ids_in_corpus(snapshot)
return generate_cellguide_card_metadata(all_cell_type_ids_in_corpus)


def get_tissue_metadata(*, ontology_tree: OntologyTreeBuilder) -> dict[str, TissueMetadata]:
def get_tissue_metadata() -> dict[str, TissueMetadata]:
"""
For all tissue ids in the corpus, this pipeline will generate metadata about each tissue, including:
- name, ex: "lung"
Expand All @@ -45,4 +44,6 @@ def get_tissue_metadata(*, ontology_tree: OntologyTreeBuilder) -> dict[str, Tiss
Note that we will be filtering out obsolete tissues.
"""
return generate_cellguide_tissue_card_metadata(ontology_tree.all_tissue_ids_in_corpus)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)
all_tissue_ids_in_corpus = get_all_tissue_ids_in_corpus(snapshot)
return generate_cellguide_tissue_card_metadata(all_tissue_ids_in_corpus)
22 changes: 9 additions & 13 deletions backend/cellguide/pipeline/metadata/metadata_generator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import logging

from cellxgene_ontology_guide.ontology_parser import OntologyParser

from backend.cellguide.pipeline.constants import CELL_GUIDE_PINNED_SCHEMA_VERSION
from backend.cellguide.pipeline.metadata.types import CellMetadata, TissueMetadata
from backend.common.census_cube.utils import ontology_parser

logger = logging.getLogger(__name__)

Expand All @@ -20,7 +18,6 @@ def generate_cellguide_card_metadata(all_cell_type_ids_in_corpus: list[str]) ->
Note that we will be filtering out obsolete cell types and invalid non-CL cell types.
"""
logger.info(f"Generating cellguide card metadata for {len(all_cell_type_ids_in_corpus)} cell types...")
ontology = OntologyParser(schema_version=f"v{CELL_GUIDE_PINNED_SCHEMA_VERSION}")

cellguide_card_metadata: dict[str, CellMetadata] = {}

Expand All @@ -30,20 +27,20 @@ def generate_cellguide_card_metadata(all_cell_type_ids_in_corpus: list[str]) ->

for id in all_cell_type_ids_in_corpus:

if ontology.is_term_deprecated(id):
if ontology_parser.is_term_deprecated(id):
obsolete_cell_ids.append(id)
else:
description = ontology.get_term_description(id)
description = ontology_parser.get_term_description(id)
if description is not None:
cell_ids_with_cl_description += 1
else:
cell_ids_without_cl_description += 1

metadata = CellMetadata(
name=ontology.get_term_label(id),
name=ontology_parser.get_term_label(id),
id=id,
clDescription=description,
synonyms=ontology.get_term_synonyms(id),
synonyms=ontology_parser.get_term_synonyms(id),
)
cellguide_card_metadata[id] = metadata

Expand All @@ -66,7 +63,6 @@ def generate_cellguide_tissue_card_metadata(all_tissue_ids_in_corpus: list[str])
Note that we will be filtering out obsolete tissues.
"""
logger.info(f"Generating cellguide tissue card metadata for {len(all_tissue_ids_in_corpus)} tissues...")
ontology = OntologyParser(schema_version=f"v{CELL_GUIDE_PINNED_SCHEMA_VERSION}")

cellguide_tissue_card_metadata: dict[str, TissueMetadata] = {}

Expand All @@ -75,20 +71,20 @@ def generate_cellguide_tissue_card_metadata(all_tissue_ids_in_corpus: list[str])
uberon_ids_without_description = 0

for id in all_tissue_ids_in_corpus:
if ontology.is_term_deprecated(id):
if ontology_parser.is_term_deprecated(id):
obsolete_uberon_ids.append(id)
else:
description = ontology.get_term_description(id)
description = ontology_parser.get_term_description(id)
if description is not None:
uberon_ids_with_description += 1
else:
uberon_ids_without_description += 1

metadata = TissueMetadata(
name=ontology.get_term_label(id),
name=ontology_parser.get_term_label(id),
id=id,
uberonDescription=description,
synonyms=ontology.get_term_synonyms(id),
synonyms=ontology_parser.get_term_synonyms(id),
)
cellguide_tissue_card_metadata[id] = metadata

Expand Down
12 changes: 6 additions & 6 deletions backend/cellguide/pipeline/ontology_tree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
ONTOLOGY_TREE_STATE_PER_TISSUE_FOLDERNAME,
ONTOLOGY_TREE_TOPLEVEL_FOLDERNAME,
)
from backend.cellguide.pipeline.constants import CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION
from backend.cellguide.pipeline.ontology_tree.tree_builder import OntologyTreeBuilder
from backend.cellguide.pipeline.ontology_tree.types import OntologyTreeData
from backend.cellguide.pipeline.utils import output_json, output_json_per_key
from backend.wmg.api.wmg_api_config import WMG_API_SNAPSHOT_SCHEMA_VERSION
from backend.wmg.data.snapshot import WmgSnapshot, load_snapshot
from backend.common.census_cube.data import snapshot as sn


def run(*, output_directory):
snapshot = load_snapshot(snapshot_schema_version=WMG_API_SNAPSHOT_SCHEMA_VERSION)
snapshot = sn.load_snapshot(snapshot_schema_version=CELLGUIDE_CENSUS_CUBE_DATA_SCHEMA_VERSION)
ontology_tree_data = get_ontology_tree_data(snapshot=snapshot)

for organism in ontology_tree_data:
Expand Down Expand Up @@ -51,18 +51,18 @@ def get_celltype_to_tissue_mapping(all_states_per_tissue):
return celltype_to_tissue_mapping


def get_ontology_tree_builder(*, snapshot: WmgSnapshot) -> OntologyTreeBuilder:
def get_ontology_tree_builder(*, snapshot: sn.CensusCubeSnapshot) -> OntologyTreeBuilder:
cell_counts_df = snapshot.cell_counts_cube.df[:]
return OntologyTreeBuilder(cell_counts_df)


def get_ontology_tree_builder_for_organism(*, snapshot: WmgSnapshot, organism: str) -> OntologyTreeBuilder:
def get_ontology_tree_builder_for_organism(*, snapshot: sn.CensusCubeSnapshot, organism: str) -> OntologyTreeBuilder:
cell_counts_df = snapshot.cell_counts_cube.df[:]
cell_counts_df = cell_counts_df[cell_counts_df["organism_ontology_term_id"] == organism]
return OntologyTreeBuilder(cell_counts_df)


def get_ontology_tree_data(*, snapshot: WmgSnapshot) -> tuple[OntologyTreeBuilder, OntologyTreeData]:
def get_ontology_tree_data(*, snapshot: sn.CensusCubeSnapshot) -> tuple[OntologyTreeBuilder, OntologyTreeData]:
organisms = snapshot.cell_counts_cube.df[:]["organism_ontology_term_id"].unique()
ontology_tree_data = {}
for organism in organisms:
Expand Down
Loading

0 comments on commit 11a520f

Please sign in to comment.