Skip to content

Commit 9ccbf0e

Browse files
authored
Merge pull request #131 from ranking-agent/refactor
Refactor. This cleans out a lot of stuff and makes a functional version of MCQ along with a partial version of EDGAR.
2 parents 82578f1 + f5d96a8 commit 9ccbf0e

31 files changed

+657502
-1152975
lines changed

documentation/MultiCurieQueries.ipynb

Lines changed: 755 additions & 0 deletions
Large diffs are not rendered by default.

main.sh

File mode changed: 100644 → 100755
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55
# gunicorn --bind 0.0.0.0:6380 -w 1 -k uvicorn.workers.UvicornWorker -t 600 src.server:APP --root-path /1.3
66

77
#uvicorn --host 0.0.0.0 --port 6380 --workers 1 src.server:APP --root-path /1.3
8-
uvicorn --host 0.0.0.0 --port 6380 --workers 1 src.server:APP --root-path /1.4
8+
uvicorn --host 0.0.0.0 --port 6380 --workers 1 src.server:APP --root-path /

openapi-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ x-translator:
2323
- Ranking Agent
2424
infores: "infores:answer-coalesce"
2525
x-trapi:
26-
version: 1.4.0
26+
version: 1.5.0
2727
operations:
2828
- coalesce answers
2929
- coalesce by ontology

requirements.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
bmt==1.2.1
2-
Jinja2~=2.11.2
3-
jsonschema~=3.2.0
4-
neo4j~=4.1.1
2+
Jinja2~=3.1.4
3+
jsonschema>=4.18.0
54
pyyaml~=6.0
65
argparse~=1.4.0
76
setuptools~=50.3.2
@@ -12,11 +11,12 @@ sparqlwrapper~=1.8.5
1211
redis~=5.0.3
1312
fastapi==0.103.1
1413
sanic~=23.6.0
15-
httpx~=0.24.1
14+
httpx>=0.25.0
1615
gunicorn==21.2.0
1716
numpy
1817
pydantic==1.10.10
1918
uvicorn==0.23.2
2019
reasoner-pydantic==5.0.2
2120
jsonlines
2221
orjson
22+
pytest-asyncio

src/components.py

Lines changed: 202 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,86 +1,218 @@
11
from copy import deepcopy
22
from collections import defaultdict
3-
import ast, json
3+
import ast, json, uuid
44
from string import Template
55

6-
class Opportunity:
7-
def __init__(self, hash, qg, kg, a_i, ai2kg):
8-
"""
9-
Define a coalescent opportunity by a hash (the fixed parts of the answers)
10-
qg: a (qg_id, semantic type) pair
11-
kg: kg_ids allowed for the combined answers
12-
a_i: indices into the results saying which answers are combined to give this
13-
opportunity
14-
kg2a_i: dict mapping frozenset of kg -> which answer they appear in. Used to filter.
15-
"""
16-
self.answer_hash = hash
17-
self.qg_id = qg[0]
18-
self.qg_semantic_type = qg[1]
19-
self.kg_ids = kg
20-
self.answer_indices = a_i
21-
self.answerid2kg = ai2kg
22-
def get_kg_ids(self):
23-
return self.kg_ids
24-
def get_qg_id(self):
25-
return self.qg_id
26-
def get_qg_semantic_type(self):
27-
stype = self.qg_semantic_type
28-
if isinstance(stype, list):
29-
stype = stype[0]
30-
return stype
31-
def get_answer_indices(self):
32-
return self.answer_indices
33-
def filter(self,new_kg_ids):
34-
"""We constructed the opportunities without regard to what nodes we have data on. Now, we want to filter
35-
this opportunity to only answers where we actually have info about the nodes.
36-
new_kg_ids is a subset of self.kg_ids. If we have an answer where we don't have all the nodes, we want to
37-
get rid of that answer, and return a new (filtered) opp. If we get rid of all answers, return None"""
38-
if len(new_kg_ids) == len(self.kg_ids):
39-
#No filtering required
40-
return self
41-
nkgids = set(new_kg_ids)
42-
new_ai2kg = {}
43-
for answer_i, kgs in self.answerid2kg.items():
44-
keep = True
45-
for kg_i in kgs:
46-
if kg_i not in nkgids:
47-
keep = False
48-
break
49-
if keep:
50-
new_ai2kg[answer_i] = kgs
51-
if len(new_ai2kg) == 0:
52-
return None
53-
#If we removed any answers, we might also need to remove some kg_ids that weren't explicitly filtered
54-
final_kg_ids = set()
55-
for kgi in new_ai2kg.values():
56-
final_kg_ids.update(kgi)
57-
return Opportunity(self.answer_hash, (self.qg_id , self.qg_semantic_type ), list(final_kg_ids), list(new_ai2kg.keys()), new_ai2kg)
6+
###
7+
# These classes are used to extract the meaning from the TRAPI MCQ query into a more usable form
8+
###
9+
10+
# TODO: Handle the case where we are not gifted a category for the group node
11+
class MCQGroupNode:
12+
def __init__(self, query_graph):
13+
for qnode_id, qnode in query_graph["nodes"].items():
14+
if qnode.get("set_interpretation", "") == "MANY":
15+
self.curies = qnode["member_ids"]
16+
self.qnode_id = qnode_id
17+
self.uuid = qnode["ids"][0]
18+
self.semantic_type = qnode["categories"][0]
19+
20+
class MCQEnrichedNode:
21+
def __init__(self, query_graph):
22+
for qnode_id, qnode in query_graph["nodes"].items():
23+
if qnode.get("set_interpretation", "") != "MANY":
24+
self.qnode_id = qnode_id
25+
self.semantic_types = qnode["categories"]
26+
27+
class MCQEdge:
28+
def __init__(self, query_graph,groupnode_qnodeid):
29+
for qedge_id, qedge in query_graph["edges"].items():
30+
if qedge["subject"] == groupnode_qnodeid:
31+
self.group_is_subject = True
32+
else:
33+
self.group_is_subject = False
34+
self.qedge_id = qedge_id
35+
self.predicate_only = qedge.get("predicates",["biolink:related_to"])[0]
36+
self.predicate = {"predicate": self.predicate_only }
37+
self.qualifiers = []
38+
qualifier_constraints = qedge.get("qualifiers_constraints", [])
39+
if len(qualifier_constraints) > 0:
40+
qc = qualifier_constraints[0]
41+
self.qualifiers = qc.get("qualifier_set", [])
42+
for q in self.qualifiers:
43+
self.predicate[q["qualifier_type_id"]] = q["qualifier_value"]
44+
45+
class MCQDefinition:
46+
def __init__(self,in_message):
47+
query_graph = in_message["message"]["query_graph"]
48+
self.group_node = MCQGroupNode(query_graph)
49+
self.enriched_node = MCQEnrichedNode(query_graph)
50+
self.edge = MCQEdge(query_graph,self.group_node.qnode_id)
51+
52+
53+
###
54+
# These components are about holding the results of Graph enrichment in a TRAPI independent way
55+
###
5856

5957
class NewNode:
60-
def __init__(self,newnode, newnodetype, edge_pred_and_qual, newnode_is, newnode_name):
61-
self.newnode = newnode
58+
def __init__(self, newnode, newnodetype: list[str]): #edge_pred_and_qual, newnode_is):
59+
self.new_curie = newnode
6260
self.newnode_type = newnodetype
63-
self.new_edges = edge_pred_and_qual
64-
self.newnode_is = newnode_is
65-
self.newnode_name = newnode_name
61+
self.newnode_name = None
62+
63+
class NewEdge:
64+
def __init__(self, source, predicate: str, target):
65+
self.source = source
66+
self.predicate = predicate
67+
self.target = target
68+
def get_prov_link(self):
69+
return f"{self.source} {self.predicate} {self.target}"
70+
71+
def get_sym_prov_link(self):
72+
return f"{self.target} {self.predicate} {self.source}"
73+
74+
def add_prov(self,prov):
75+
self.prov = prov
76+
77+
class Lookup_params:
78+
def __init__(self, in_message):
79+
for qedge_id, qedges in in_message.get("message", {}).get("query_graph", {}).get("edges", {}).items():
80+
subject = in_message.get("message", {}).get("query_graph", {}).get("nodes", {})[qedges["subject"]]
81+
object = in_message.get("message", {}).get("query_graph", {}).get("nodes", {})[qedges["object"]]
82+
if subject.get("ids", []):
83+
is_source = True
84+
else:
85+
is_source = False
86+
if is_source:
87+
curies = subject["ids"][0]
88+
input_qnode = qedges["subject"]
89+
output_qnode = qedges["object"]
90+
semantic_type = object.get("categories", [])[0]
91+
else:
92+
curie = object["ids"][0]
93+
input_qnode = qedges["object"]
94+
output_qnode = qedges["subject"]
95+
semantic_type = subject.get("categories", [])[0]
96+
predicate_parts = {"predicate": qedges["predicates"][0]}
97+
qualifier_constraints = qedges.get("qualifier_constraints", [])
98+
if len(qualifier_constraints) > 0:
99+
qc = qualifier_constraints[0]
100+
qs = qc.get("qualifier_set", [])
101+
for q in qs:
102+
predicate_parts[q["qualifier_type_id"].split(":")[1]] = q["qualifier_value"]
103+
predicate_parts = json.dumps(predicate_parts, sort_keys=True)
104+
self.is_source = is_source
105+
self.curie = curie
106+
self.predicate_parts = predicate_parts
107+
self.input_qnode = input_qnode
108+
self.output_qnode = output_qnode
109+
self.output_semantic_type = semantic_type
110+
self.qedge_id = qedge_id
111+
112+
class Lookup:
113+
def __init__(self, curie, predicate, is_source, node_names, node_types, lookup_ids, params_output_semantic_type = []):
114+
self.predicate = predicate
115+
self.is_source = is_source
116+
self.link_ids = lookup_ids
117+
118+
self.add_input_node(curie, node_types)
119+
self.add_input_node_name(node_names)
120+
self.add_links(node_names, node_types)
121+
self.add_linked_edges(curie, is_source)
122+
123+
124+
def add_input_node(self, curie, node_types):
125+
"""Optionally, we can patch by adding a new node, which will share a relationship of
126+
some sort to the curies in self.set_curies. The remaining parameters give the edge_type
127+
of those edges, as well as defining whether the edge points to the newnode (newnode_is = 'target')
128+
or away from it (newnode_is = 'source') """
129+
self.input_qnode_curie = NewNode(curie, node_types.get(curie, None))
130+
def add_input_node_name(self, node_names):
131+
self.input_qnode_curie.name = node_names.get(self.input_qnode_curie.new_curie, None)
132+
133+
def add_links(self, nodenames, nodetypes):
134+
self.lookup_links = [Lookup_Links(link_id, nodenames.get(link_id), nodetypes.get(link_id)) for link_id in self.link_ids]
135+
def add_linked_edges(self, input_node, input_node_is_source):
136+
"""Add edges between the newnode (curie) and the curies that they were linked to"""
137+
if input_node_is_source:
138+
for i, new_ids in enumerate(self.link_ids):
139+
self.lookup_links[i].link_edge = NewEdge(input_node, self.predicate, new_ids)
140+
else:
141+
for i, new_ids in enumerate(self.link_ids):
142+
self.lookup_links[i].link_edge = NewEdge(new_ids, self.predicate, input_node)
143+
def add_linked_kg_edges_id(self, eid):
144+
"""Add edges between the newnode (curie) and the curies that they were linked to as written in the KG"""
145+
self.link_kg_edges_ids.append(eid)
146+
def get_prov_links(self):
147+
return [link.link_edge.get_prov_link() for link in self.lookup_links]
148+
149+
def add_provenance(self, prov):
150+
for link in self.lookup_links:
151+
if prov.get(link.link_edge.get_prov_link()):
152+
link.link_edge.add_prov(prov[link.link_edge.get_prov_link()])
153+
else:
154+
link.link_edge.add_prov(prov[link.link_edge.get_sym_prov_link()])
66155

67-
class PropertyPatch:
68-
def __init__(self,qg_id,curies,props,answer_ids):
69-
self.qg_id = qg_id
70-
self.set_curies = curies
71-
self.new_props = props
72-
self.answer_indices = answer_ids
73-
self.added_nodes = []
156+
def add_enrichment(self, lookup_indices, enriched_node, predicate, is_source, pvalue):
157+
for index in lookup_indices:
158+
if hasattr(self.lookup_links[index], 'enrichments'):
159+
self.lookup_links[index].enrichments.append(Link_enrichment(enriched_node, predicate, is_source, pvalue))
160+
else:
161+
self.lookup_links[index].enrichments= [Link_enrichment(enriched_node, predicate, is_source, pvalue)]
162+
163+
class Lookup_Links:
164+
def __init__(self, link_id, link_name, link_type):
165+
self.link_id = link_id
166+
self.link_name = link_name
167+
self.link_type = link_type
168+
169+
class Link_enrichment:
170+
def __init__(self, enriched_node, predicate, is_source,pvalue):
171+
self.enriched_node = enriched_node
172+
self.predicate = predicate
173+
self.is_source = is_source
174+
self.p_value = pvalue
175+
176+
class Enrichment:
177+
def __init__(self,p_value,newnode: str, predicate: str, is_source, ndraws, n, total_node_count, curies, node_type: list[str]):
178+
"""Here the curies are the curies that actually link to newnode, not just the input curies."""
179+
self.p_value = p_value
180+
self.linked_curies = curies
181+
self.enriched_node = None
182+
self.predicate = predicate
183+
self.is_source = is_source
74184
self.provmap = {}
75-
def add_provenance(self,provmap):
76-
self.provmap = provmap
77-
def add_extra_node(self,newnode, newnodetype, edge_pred_and_qual, newnode_is,newnode_name):
185+
self.add_extra_node(newnode, node_type)
186+
self.add_extra_edges(newnode, predicate, is_source)
187+
self.counts = [ndraws, n, total_node_count]
188+
def add_extra_node(self,newnode, newnodetype: list[str]):
78189
"""Optionally, we can patch by adding a new node, which will share a relationship of
79190
some sort to the curies in self.set_curies. The remaining parameters give the edge_type
80191
of those edges, as well as defining whether the edge points to the newnode (newnode_is = 'target')
81192
or away from it (newnode_is = 'source') """
82-
self.added_nodes.append( NewNode(newnode, newnodetype, edge_pred_and_qual, newnode_is, newnode_name) )
83-
def apply(self,answers,question,graph,graph_index,patch_no):
193+
self.enriched_node = NewNode(newnode, newnodetype)
194+
def add_extra_node_name_and_label(self,name_dict,label_dict):
195+
self.enriched_node.newnode_name = name_dict.get(self.enriched_node.new_curie, None)
196+
self.enriched_node.newnode_type = label_dict.get(self.enriched_node.new_curie, [])
197+
def add_extra_edges(self, newnode, predicate: str, newnode_is_source):
198+
"""Add edges between the newnode (curie) and the curies that they were linked to"""
199+
if newnode_is_source:
200+
self.links = [NewEdge(newnode,predicate,curie) for curie in self.linked_curies]
201+
else:
202+
self.links = [NewEdge(curie,predicate,newnode) for curie in self.linked_curies]
203+
def get_prov_links(self):
204+
return [link.get_prov_link() for link in self.links]
205+
def add_provenance(self,prov):
206+
for link in self.links:
207+
provlink = link.get_prov_link()
208+
symprovlink = link.get_sym_prov_link()
209+
if prov.get(provlink):
210+
link.add_prov(prov[provlink])
211+
else:
212+
link.add_prov(prov[symprovlink])
213+
214+
#TODO: this should not exist in here any more, we are just making a data class
215+
def x_apply(self,answers,question,graph,graph_index,patch_no):
84216
# Find the answers to combine. It's not necessarily the answer_ids. Those were the
85217
# answers that were originally in the opportunity, but we might have only found commonality
86218
# among a subset of them

src/graph_coalescence/build_redis_files.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,7 @@ def get_filter_nodes():
4848
that we don't have anyway, but also a bunch of very generic terms (Disease, Human) that are not useful
4949
and make lots of links in our database."""
5050

51-
#At the moment, we are going to get the blocklist from my PR but this can very soon be replaced with
52-
# the real one
53-
blocklist_url = "https://raw.githubusercontent.com/cbizon/Relay/master/config/blocklist.json"
54-
#blocklist_url = "https://raw.githubusercontent.com/NCATSTranslator/Relay/master/config/blocklist.json"
51+
blocklist_url = "https://raw.githubusercontent.com/NCATSTranslator/Relay/master/config/blocklist.json"
5552

5653
blocklist = json.loads(requests.get(blocklist_url).text)
5754
return set(blocklist)

0 commit comments

Comments (0)