|
1 |
| -import gzip |
2 |
| -import json |
3 |
| -import os |
4 |
| -from collections import defaultdict |
5 |
| -from typing import Optional |
6 |
| - |
7 | 1 | from flask import jsonify, make_response
|
8 | 2 |
|
9 |
| -from backend.cellguide.common.config import CellGuideConfig |
10 |
| -from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME |
11 |
| -from backend.cellguide.common.providers.s3_provider import S3Provider |
12 |
| -from backend.cellguide.common.utils import get_object_key |
13 |
| -from backend.common.utils.ontology_parser import ontology_parser |
14 |
| - |
15 |
| - |
16 |
| -def _defaultdict_to_dict(d): |
17 |
| - if isinstance(d, defaultdict): |
18 |
| - # Convert the defaultdict to a dict and recursively apply this function |
19 |
| - return {key: _defaultdict_to_dict(value) for key, value in d.items()} |
20 |
| - else: |
21 |
| - return d |
22 |
| - |
23 |
| - |
24 |
| -def _initialize_cellguide_marker_gene_dict(): |
25 |
| - bucket = CellGuideConfig().bucket |
26 |
| - s3_provider = S3Provider() |
27 |
| - |
28 |
| - latest_snapshot_identifier = ( |
29 |
| - s3_provider.download_file(bucket_name=bucket, object_key=get_object_key(object="latest_snapshot_identifier")) |
30 |
| - .decode("utf-8") |
31 |
| - .strip() |
32 |
| - ) |
33 |
| - compressed_data = s3_provider.download_file( |
34 |
| - bucket_name=bucket, |
35 |
| - object_key=get_object_key( |
36 |
| - object=f"{latest_snapshot_identifier}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}" |
37 |
| - ), |
38 |
| - ) |
39 |
| - marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8")) |
40 |
| - data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) |
41 |
| - |
42 |
| - for gene in marker_gene_data: |
43 |
| - for organism in marker_gene_data[gene]: |
44 |
| - for tissue in marker_gene_data[gene][organism]: |
45 |
| - for marker in marker_gene_data[gene][organism][tissue]: |
46 |
| - data[organism][tissue][marker["cell_type_id"]].append( |
47 |
| - {"marker_score": marker["marker_score"], "me": marker["me"], "pc": marker["pc"], "gene": gene} |
48 |
| - ) |
49 |
| - |
50 |
| - data = _defaultdict_to_dict(data) |
51 |
| - |
52 |
| - for organism in data: |
53 |
| - for tissue in data[organism]: |
54 |
| - for cell_type in data[organism][tissue]: |
55 |
| - data[organism][tissue][cell_type].sort(key=lambda x: -x["marker_score"]) |
56 |
| - |
57 |
| - return data |
58 |
| - |
59 |
| - |
60 |
| -def is_id(term: str) -> bool: |
61 |
| - return term.startswith("CL:") or term.startswith("UBERON:") or term.startswith("NCBITaxon:") |
| 3 | +from backend.cellguide.api.common.data import get_marker_gene_data |
62 | 4 |
|
63 | 5 |
|
64 |
| -def get(organism: str, tissue: Optional[str] = None, cell_type: Optional[str] = None): |
def get():
    """
    Retrieve all CellGuide marker gene data.

    Takes no arguments. The complete dataset returned by `get_marker_gene_data()`
    is serialized with `jsonify` and sent back with HTTP status 200.

    Returns:
        Flask Response: JSON data of the marker genes.
        The response is a nested dictionary:
            - organisms --> tissues --> cell types --> list(marker genes).

        Organisms and tissues are labels, cell types are IDs, and marker genes are
        dictionaries with the following keys:
            - `marker_score`: The score indicating the strength of the marker gene for the cell type.
            - `me`: Mean expression of the gene across the cells of the specified type.
            - `pc`: Percentage of cells within the specified type that express the gene.
            - `gene`: The gene symbol associated with the marker gene data.
    """
    return make_response(jsonify(get_marker_gene_data()), 200)
0 commit comments