Skip to content

Commit 94d0dc1

Browse files
authored
Merge branch 'main' into atar-qol
2 parents 46e0439 + 4d7e6b0 commit 94d0dc1

File tree

4 files changed

+113
-201
lines changed

4 files changed

+113
-201
lines changed

backend/cellguide/api/cellguide-api.yml

Lines changed: 29 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
openapi: 3.0.0
22
info:
33
version: "1.0.0"
4-
title: Chan Zuckerberg Initiative CELLxGENE Discover API
4+
title: Chan Zuckerberg Initiative CELLxGENE CellGuide API
55
termsOfService: https://cellxgene.cziscience.com/tos
66
contact:
77
name: API Support
88
email: cellxgene@chanzuckerberg.com
99
description: |
10-
## Uploads for CellGuide Descriptions
10+
## Overview of Available Endpoints for the CELLxGENE CellGuide API
1111
servers:
1212
- description: Production environment
1313
url: https://api.cellxgene.cziscience.com/
@@ -47,81 +47,46 @@ paths:
4747
$ref: "#/components/responses/401"
4848
"403":
4949
$ref: "#/components/responses/403"
50-
/v1/marker_genes:
51-
get:
52-
summary: Get marker genes for a given organism, tissue (optional), and cell type (optional)
53-
operationId: backend.cellguide.api.v1.marker_genes.actions.get
54-
tags:
55-
- CellGuide
56-
parameters:
57-
- in: query
58-
name: organism
59-
required: true
60-
schema:
61-
$ref: "#/components/schemas/organism_ontology_term_id_or_label"
62-
description: The ontology ID or name of the organism.
63-
- in: query
64-
name: tissue
65-
required: false
66-
schema:
67-
$ref: "#/components/schemas/tissue_ontology_term_id_or_label"
68-
description: The ontology ID or name of the tissue, optional. Use 'All Tissues' to get tissue-agnostic marker genes.
69-
- in: query
70-
name: cell_type
71-
required: false
72-
schema:
73-
$ref: "#/components/schemas/cell_ontology_term_id"
74-
description: The ontology ID or name of the cell type, optional.
75-
responses:
76-
"200":
77-
description: "Successful retrieval of marker genes data."
78-
content:
79-
application/json:
80-
schema:
81-
$ref: "#/components/schemas/marker_gene_data"
82-
"404":
83-
description: "No data found for the specified parameters."
84-
content:
85-
application/json:
86-
schema:
87-
$ref: "#/components/schemas/problem"
50+
# This is commented out until product aligns on the CellGuide API
51+
# Leaving this here for reference
52+
# /v1/marker_genes:
53+
# get:
54+
# summary: |
55+
# Get all CellGuide marker genes.
56+
# operationId: backend.cellguide.api.v1.marker_genes.actions.get
57+
# tags:
58+
# - CellGuide
59+
# responses:
60+
# "200":
61+
# description: "Successful retrieval of marker genes data."
62+
# content:
63+
# application/json:
64+
# schema:
65+
# $ref: "#/components/schemas/marker_gene_data"
66+
# "404":
67+
# description: "No data found for the specified parameters."
68+
# content:
69+
# application/json:
70+
# schema:
71+
# $ref: "#/components/schemas/problem"
8872

8973
components:
9074
schemas:
91-
organism_ontology_term_id_or_label:
92-
description: Organism ontology term ID or label
93-
example: Homo sapiens
94-
type: string
95-
tissue_ontology_term_id_or_label:
96-
description: Tissue ontology term ID or label
97-
example: All Tissues
98-
type: string
99-
cell_ontology_term_id:
100-
description: Cell ontology term ID
101-
example: CL:0000540
102-
type: string
10375
cell_ontology_term_id_fs:
10476
description: File system-compatible cell ontology term ID (with underscores)
10577
example: CL_0000030
10678
type: string
10779
marker_gene_data:
10880
description: All CellGuide marker gene data, returned as a nested object keyed by organism label, then tissue label, then cell type ontology term ID. Each cell type maps to a list of marker gene entries.
109-
oneOf:
110-
- type: object
111-
additionalProperties: # This allows any property name for tissue
112-
type: object
113-
additionalProperties: # This allows any property name for cell type
114-
type: array
115-
items:
116-
$ref: "#/components/schemas/marker_gene_entry"
117-
- type: object
81+
type: object
82+
additionalProperties: # This allows any property name for organism
83+
type: object
84+
additionalProperties: # This allows any property name for tissue
85+
type: object
11886
additionalProperties: # This allows any property name for cell type
11987
type: array
12088
items:
12189
$ref: "#/components/schemas/marker_gene_entry"
122-
- type: array
123-
items:
124-
$ref: "#/components/schemas/marker_gene_entry"
12590
marker_gene_entry:
12691
type: object
12792
properties:

backend/cellguide/api/common/data.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import gzip
2+
import json
3+
import os
4+
from collections import defaultdict
5+
6+
from backend.cellguide.common.config import CellGuideConfig
7+
from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
8+
from backend.cellguide.common.providers.s3_provider import S3Provider
9+
from backend.cellguide.common.utils import get_object_key
10+
11+
12+
def _defaultdict_to_dict(d):
13+
if isinstance(d, defaultdict):
14+
# Convert the defaultdict to a dict and recursively apply this function
15+
return {key: _defaultdict_to_dict(value) for key, value in d.items()}
16+
else:
17+
return d
18+
19+
20+
def _initialize_cellguide_marker_gene_dict():
    """Download the latest marker gene snapshot and re-key it for lookups.

    Reads the ``latest_snapshot_identifier`` object from the CellGuide bucket,
    then downloads and gunzips the marker gene presence JSON stored under that
    snapshot. The downloaded JSON is keyed as
    ``gene -> organism -> tissue -> [marker, ...]``; this function inverts it
    into ``organism -> tissue -> cell_type_id -> [entry, ...]``.

    Returns:
        A plain nested ``dict``. Each leaf list holds dictionaries with the
        keys ``marker_score``, ``me``, ``pc`` and ``gene``, ordered by
        descending ``marker_score``.
    """
    bucket = CellGuideConfig().bucket
    s3_provider = S3Provider()

    # The identifier object is a small text blob; strip surrounding whitespace
    # before using it as a key prefix.
    latest_snapshot_identifier = (
        s3_provider.download_file(bucket_name=bucket, object_key=get_object_key(object="latest_snapshot_identifier"))
        .decode("utf-8")
        .strip()
    )
    compressed_data = s3_provider.download_file(
        bucket_name=bucket,
        object_key=get_object_key(
            object=f"{latest_snapshot_identifier}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}"
        ),
    )
    marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8"))

    # Three levels of auto-created dicts ending in a list, so entries can be
    # appended without checking whether intermediate keys exist yet.
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for gene in marker_gene_data:
        for organism in marker_gene_data[gene]:
            for tissue in marker_gene_data[gene][organism]:
                for marker in marker_gene_data[gene][organism][tissue]:
                    data[organism][tissue][marker["cell_type_id"]].append(
                        {"marker_score": marker["marker_score"], "me": marker["me"], "pc": marker["pc"], "gene": gene}
                    )

    # Freeze into plain dicts so later lookups of missing keys do not silently
    # create empty entries.
    data = _defaultdict_to_dict(data)

    # Strongest markers first within every cell type.
    for organism in data:
        for tissue in data[organism]:
            for cell_type in data[organism][tissue]:
                data[organism][tissue][cell_type].sort(key=lambda x: -x["marker_score"])

    return data
54+
55+
56+
# Module-level cache; stays None until get_marker_gene_data() is first called.
_marker_gene_data_cache = None


def get_marker_gene_data():
    """Return the cached marker gene data, populating the cache on first use.

    On the first call the cache is filled either from the latest snapshot
    (via ``_initialize_cellguide_marker_gene_dict``) or, when the
    ``DEPLOYMENT_STAGE`` environment variable equals ``"test"``, with an empty
    dict. Subsequent calls return the same cached object.

    Returns:
        The cached marker gene data structure (``{}`` in test mode).
    """
    global _marker_gene_data_cache
    if _marker_gene_data_cache is None:
        if os.getenv("DEPLOYMENT_STAGE") != "test":
            # Initialize the marker gene data from the latest snapshot only if not in test mode
            _marker_gene_data_cache = _initialize_cellguide_marker_gene_dict()
        else:
            # Initialize an empty structure if in test mode
            _marker_gene_data_cache = {}
    return _marker_gene_data_cache
Lines changed: 14 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,121 +1,23 @@
1-
import gzip
2-
import json
3-
import os
4-
from collections import defaultdict
5-
from typing import Optional
6-
71
from flask import jsonify, make_response
82

9-
from backend.cellguide.common.config import CellGuideConfig
10-
from backend.cellguide.common.constants import COMPUTATIONAL_MARKER_GENES_FOLDERNAME, MARKER_GENE_PRESENCE_FILENAME
11-
from backend.cellguide.common.providers.s3_provider import S3Provider
12-
from backend.cellguide.common.utils import get_object_key
13-
from backend.common.utils.ontology_parser import ontology_parser
14-
15-
16-
def _defaultdict_to_dict(d):
17-
if isinstance(d, defaultdict):
18-
# Convert the defaultdict to a dict and recursively apply this function
19-
return {key: _defaultdict_to_dict(value) for key, value in d.items()}
20-
else:
21-
return d
22-
23-
24-
def _initialize_cellguide_marker_gene_dict():
25-
bucket = CellGuideConfig().bucket
26-
s3_provider = S3Provider()
27-
28-
latest_snapshot_identifier = (
29-
s3_provider.download_file(bucket_name=bucket, object_key=get_object_key(object="latest_snapshot_identifier"))
30-
.decode("utf-8")
31-
.strip()
32-
)
33-
compressed_data = s3_provider.download_file(
34-
bucket_name=bucket,
35-
object_key=get_object_key(
36-
object=f"{latest_snapshot_identifier}/{COMPUTATIONAL_MARKER_GENES_FOLDERNAME}/{MARKER_GENE_PRESENCE_FILENAME}"
37-
),
38-
)
39-
marker_gene_data = json.loads(gzip.decompress(compressed_data).decode("utf-8"))
40-
data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
41-
42-
for gene in marker_gene_data:
43-
for organism in marker_gene_data[gene]:
44-
for tissue in marker_gene_data[gene][organism]:
45-
for marker in marker_gene_data[gene][organism][tissue]:
46-
data[organism][tissue][marker["cell_type_id"]].append(
47-
{"marker_score": marker["marker_score"], "me": marker["me"], "pc": marker["pc"], "gene": gene}
48-
)
49-
50-
data = _defaultdict_to_dict(data)
51-
52-
for organism in data:
53-
for tissue in data[organism]:
54-
for cell_type in data[organism][tissue]:
55-
data[organism][tissue][cell_type].sort(key=lambda x: -x["marker_score"])
56-
57-
return data
58-
59-
60-
def is_id(term: str) -> bool:
61-
return term.startswith("CL:") or term.startswith("UBERON:") or term.startswith("NCBITaxon:")
3+
from backend.cellguide.api.common.data import get_marker_gene_data
624

635

64-
def get(organism: str, tissue: Optional[str] = None, cell_type: Optional[str] = None):
6+
def get():
657
"""
66-
Retrieve marker gene data for a specified organism, and optionally for a specific tissue and/or cell type.
8+
Retrieve all CellGuide marker gene data.
679
68-
This function handles the retrieval of marker gene data based on the provided organism, tissue, and cell type.
69-
It uses ontology IDs to resolve the actual names and checks if the provided IDs exist in the marker gene data.
70-
If the IDs are valid, it returns the corresponding data; otherwise, it returns a 404 response.
71-
72-
Parameters:
73-
organism (str): The ontology ID or name of the organism.
74-
tissue (Optional[str]): The ontology ID or name of the tissue. Default is None.
75-
cell_type (Optional[str]): The ontology ID or name of the cell type. Default is None.
10+
This function handles the retrieval of all CellGuide marker gene data.
7611
7712
Returns:
78-
Flask Response: JSON data of the marker genes for the requested parameters or a 404 error if not found.
13+
Flask Response: JSON data of the marker genes.
14+
The response structure is a nested dictionary with the following structure:
15+
- organisms --> tissues --> cell types --> list(marker genes).
16+
17+
Organisms and tissues are labels, cell types are IDs, and marker genes are dictionaries with the following keys:
18+
- `marker_score`: The score indicating the strength of the marker gene for the cell type.
19+
- `me`: Mean expression of the gene across the cells of the specified type.
20+
- `pc`: Percentage of cells within the specified type that express the gene.
21+
- `gene`: The gene symbol associated with the marker gene data.
7922
"""
80-
marker_gene_data = get_marker_gene_data()
81-
82-
if is_id(organism):
83-
organism = ontology_parser.get_term_label(organism)
84-
if tissue and is_id(tissue):
85-
tissue = ontology_parser.get_term_label(tissue)
86-
if tissue and cell_type and not is_id(cell_type):
87-
raise ValueError("cell_type must be an ID")
88-
89-
if not tissue and cell_type or tissue and tissue.lower() == "all tissues":
90-
tissue = "All Tissues"
91-
92-
if organism not in marker_gene_data:
93-
return make_response(jsonify({}), 404)
94-
95-
if tissue and tissue not in marker_gene_data[organism]:
96-
return make_response(jsonify({}), 404)
97-
98-
if tissue and cell_type and cell_type not in marker_gene_data[organism][tissue]:
99-
return make_response(jsonify({}), 404)
100-
101-
if tissue and cell_type:
102-
return make_response(jsonify(marker_gene_data[organism][tissue][cell_type]), 200)
103-
elif tissue:
104-
return make_response(jsonify(marker_gene_data[organism][tissue]), 200)
105-
else:
106-
return make_response(jsonify(marker_gene_data[organism]), 200)
107-
108-
109-
_marker_gene_data_cache = None
110-
111-
112-
def get_marker_gene_data():
113-
global _marker_gene_data_cache
114-
if _marker_gene_data_cache is None:
115-
if os.getenv("DEPLOYMENT_STAGE") != "test":
116-
# Initialize the marker gene data from the latest snapshot only if not in test mode
117-
_marker_gene_data_cache = _initialize_cellguide_marker_gene_dict()
118-
else:
119-
# Initialize an empty structure if in test mode
120-
_marker_gene_data_cache = {}
121-
return _marker_gene_data_cache
23+
return make_response(jsonify(get_marker_gene_data()), 200)

tests/unit/backend/cellguide/api/test_marker_genes.py

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,30 +15,7 @@ def setUp(self):
1515
)
1616
def test_get_marker_genes(self):
1717
# Test case: No query parameters; the endpoint returns all marker gene data
18-
response = self.app.get(
19-
"/cellguide/v1/marker_genes?organism=NCBITaxon:9606&tissue=UBERON:0000955&cell_type=CL:0000540"
20-
)
18+
response = self.app.get("/cellguide/v1/marker_genes")
2119
self.assertEqual(response.status_code, 200)
2220
data = response.get_json()
23-
self.assertEqual(data, self.mock_marker_gene_data["Homo sapiens"]["brain"]["CL:0000540"])
24-
25-
# Test case: Only organism is provided
26-
response = self.app.get("/cellguide/v1/marker_genes?organism=NCBITaxon:9606")
27-
self.assertEqual(response.status_code, 200)
28-
data = response.get_json()
29-
expected_data = self.mock_marker_gene_data["Homo sapiens"]
30-
self.assertEqual(data, expected_data)
31-
32-
# Test case: Organism and tissue is provided
33-
response = self.app.get("/cellguide/v1/marker_genes?organism=NCBITaxon:9606&tissue=brain")
34-
self.assertEqual(response.status_code, 200)
35-
data = response.get_json()
36-
expected_data = self.mock_marker_gene_data["Homo sapiens"]["brain"]
37-
self.assertEqual(data, expected_data)
38-
39-
# Test case: Organism and cell type is provided, agnostic to tissues
40-
response = self.app.get("/cellguide/v1/marker_genes?organism=NCBITaxon:9606&cell_type=CL:0000540")
41-
self.assertEqual(response.status_code, 200)
42-
data = response.get_json()
43-
expected_data = self.mock_marker_gene_data["Homo sapiens"]["All Tissues"]["CL:0000540"]
44-
self.assertEqual(data, expected_data)
21+
self.assertEqual(data, self.mock_marker_gene_data)

0 commit comments

Comments
 (0)