-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCiteSearch.py
276 lines (237 loc) · 9.79 KB
/
CiteSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python
"""Search for CDR Citation documents.
"""
from functools import cached_property
from time import sleep

from lxml import etree
import requests

from cdr import prepare_pubmed_article_for_import
from cdrcgi import AdvancedSearch
from cdrapi.docs import Doc
class CitationSearch(AdvancedSearch):
    """Advanced search page specialized for Citation documents.

    On top of the search fields built from `PATHS`, users allowed to
    add Citation documents get an extra fieldset and button for
    importing a PubMed article or updating an existing Citation.
    """

    DOCTYPE = "Citation"
    SUBTITLE = DOCTYPE
    FILTER = "set:QC Citation Set"
    PUBMED = "https://www.ncbi.nlm.nih.gov/entrez/"
    MEDLINE_CITATION = "/Citation/PubmedArticle/MedlineCitation"
    PUB_DETAILS = "/Citation/PDQCitation/PublicationDetails"

    # Maps each search field name to the document paths it is matched
    # against. The insertion order is the order of the form fields.
    PATHS = {
        "title": [
            "/Citation/PubmedArticle/%/Article/%Title",
            "/Citation/PDQCitation/CitationTitle",
        ],
        "author": [
            "/Citation/PDQCitation/AuthorList/Author/%Name",
            "/Citation/PubmedArticle/%/AuthorList/Author/%Name",
        ],
        "pub_in": [
            f"{MEDLINE_CITATION}/MedlineJournalInfo/MedlineTA",
            f"{PUB_DETAILS}/PublishedIn/@cdr:ref[int_val]",
        ],
        "pub_year": [
            f"{MEDLINE_CITATION}/Article/Journal/JournalIssue/PubDate/Year",
            f"{PUB_DETAILS}/PublicationYear",
        ],
        "volume": [
            f"{MEDLINE_CITATION}/Article/Journal/JournalIssue/Volume",
        ],
        "issue": [
            f"{MEDLINE_CITATION}/Article/Journal/JournalIssue/Issue",
        ],
    }

    # Client-side script for the Import/Update fields: the button label
    # follows the CDR ID field (Import when it holds no digits, Update
    # otherwise) and the button is disabled while the PMID field is empty.
    # The text of the literal is runtime data and is kept verbatim.
    JS = """\
function chk_cdrid() {
console.log("cdr id is " + jQuery("#cdrid").val());
if (jQuery("#cdrid").val().replace(/\\D/g, "").length === 0)
jQuery("#submit-button-import").val("Import");
else
jQuery("#submit-button-import").val("Update");
}
function chk_pmid() {
console.log("pmid is " + jQuery("#pmid").val().trim());
if (jQuery("#pmid").val().trim().length === 0)
jQuery("#submit-button-import").prop("disabled", true);
else
jQuery("#submit-button-import").prop("disabled", false);
}
$(function() { chk_cdrid(); chk_pmid(); });
"""

    def __init__(self):
        """Capture the submitted search values and build the field lists.

        Each key of `PATHS` becomes an instance attribute holding the
        submitted value for that field; `query_fields` and
        `search_fields` are then populated in `PATHS` order.
        """

        super().__init__()

        # First pass: store every submitted value on the instance.
        for field_name in self.PATHS:
            setattr(self, field_name, self.fields.getvalue(field_name))

        # Second pass: one query field and one form field per entry.
        self.search_fields = []
        self.query_fields = []
        for field_name, field_paths in self.PATHS.items():
            submitted = getattr(self, field_name)
            self.query_fields.append(self.QueryField(submitted, field_paths))
            self.search_fields.append(self.text_field(field_name))

    def run(self):
        """Route the request, adding support for PubMed import/update.

        "Import" and "Update" requests are handled here by saving a
        `Citation` and redisplaying the form with the outcome. Any
        other request is passed to the base class implementation.
        """

        if self.request not in ("Import", "Update"):
            super().run()
            return

        try:
            citation = Citation(self)
            citation.save()
            self.show_form(citation.message, citation.error)
        except Exception as e:
            # Log the traceback, then report the failure on the form.
            self.session.logger.exception("%s from PubMed", self.request)
            error = f"Unable to import {self.pmid!r} from PubMed: {e}"
            self.show_form(error=error)

    @cached_property
    def pmid(self):
        """Submitted PMID with surrounding whitespace removed.

        Falls back to the empty string when the field was not sent.
        """

        submitted = self.fields.getvalue("pmid", "")
        return submitted.strip()

    @cached_property
    def cdrid(self):
        """ID extracted from the submitted CDR ID, or None if not given."""

        submitted = self.fields.getvalue("cdrid")
        if not submitted:
            return None
        return Doc.extract_id(submitted)

    def add_import_form(self, page):
        """Append the PubMed import/update fieldset and its script.

        Pass:
            page - form page object whose `form` and `head` are extended
        """

        tooltip = "Optionally enter the CDR ID of a document to be updated."
        cdrid_field = self.text_field("cdrid", label="CDR ID", tooltip=tooltip)
        cdrid_field.find("input").set("oninput", "chk_cdrid()")
        pmid_field = self.text_field("pmid", label="PMID")
        pmid_field.find("input").set("oninput", "chk_pmid()")

        # The PMID field is shown ahead of the optional CDR ID field.
        fieldset = self.fieldset("Import or Update Citation From PubMed")
        for field in (pmid_field, cdrid_field):
            fieldset.append(field)
        page.form.append(fieldset)
        page.head.append(self.B.SCRIPT(self.JS))

    def show_form(self, message=None, error=None):
        """Assemble the search form and send it to the browser.

        Pass:
            message - optional outcome of an import/update; shown as a
                      warning if it contains "validation errors",
                      otherwise as a success alert
            error - optional string shown as an error alert
        """

        page = self.Form(
            self.session.name,
            self.SUBTITLE,
            self.search_fields,
            control=self,
        )

        # Import controls are only offered to users who may add Citations.
        if self.session.can_do("ADD DOCUMENT", "Citation"):
            self.add_import_form(page)

        open_pubmed = f"window.open('{self.PUBMED}', 'pm');"
        page.form.append(page.button("Search"))
        page.form.append(page.button("Search PubMed", onclick=open_pubmed))

        if self.session.can_do("ADD DOCUMENT", "Citation"):
            import_button = self.button("Import")
            reset_target = "jQuery('#primary-form').attr('target', '');"
            import_button.set("onclick", reset_target)
            # Starts out disabled; the page script toggles it as the
            # PMID field changes.
            import_button.set("disabled")
            page.form.append(import_button)

        if message:
            if "validation errors" in message:
                alert_type = "warning"
            else:
                alert_type = "success"
            page.add_alert(message, type=alert_type)
        if error:
            page.add_alert(error, type="error")
        page.send()
class Citation:
    """Logic for assembling and saving a new or updated Citation document."""

    EUTILS = "https://eutils.ncbi.nlm.nih.gov"
    EFETCH = f"{EUTILS}/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id="
    ERRORS = "*** IMPORTED WITH ERRORS *** PUBLISHABLE VERSION NOT CREATED"
    VAL_TYPES = "schema", "links"
    COMMENT = "Saved from the Citation Advanced Search page"
    SAVE_OPTS = dict(
        version=True,
        publishable=True,
        val_types=VAL_TYPES,
        unlock=True,
        comment=COMMENT,
        reason=COMMENT,
    )

    # Seconds to wait for NLM before giving up on a single request, so
    # that a stalled connection cannot hang the page indefinitely.
    FETCH_TIMEOUT = 30

    # Seconds to pause before the single retry of a failed fetch.
    RETRY_DELAY = 2

    def __init__(self, control):
        """Save the caller's object reference.

        Most of the work is done while assembling this object's properties.

        Pass:
            control - the `CitationSearch` object driving this request;
                      supplies `pmid`, `cdrid`, `session`, and `DBQuery`
        """

        self.control = control

    @cached_property
    def error(self):
        """If there were validation errors, log them and show a big warning.

        Return:
            None if the document has no errors, otherwise `ERRORS`
        """

        if not self.doc.errors:
            return None
        for error in self.doc.errors:
            self.control.session.logger.error(str(error))
        return self.ERRORS

    @cached_property
    def message(self):
        """Prepare a subtitle showing what we just did.

        The wording depends on whether a CDR ID was supplied (update)
        or not (import), and on whether the document has errors.
        """

        cdrid = self.doc.cdr_id
        pmid = self.control.pmid
        if self.doc.errors:
            suffix = "with validation errors"
        else:
            suffix = "with a publishable version"
        if self.control.cdrid:
            return f"Updated {cdrid} from PMID {pmid} ({suffix})"
        return f"Imported PMID {pmid} as {cdrid} ({suffix})"

    def save(self):
        """Save the new or updated Citation document using `SAVE_OPTS`."""

        self.doc.save(**self.SAVE_OPTS)

    @cached_property
    def doc(self):
        """Prepare a `cdrapi.Doc` object for saving in the CDR.

        Raise:
            Exception - if the document to be updated has no
                        PubmedArticle child, or if the PMID to be
                        imported is already in a Citation document
        """

        # If we're updating an existing Citation doc, fetch and modify it.
        if self.control.cdrid:
            cdrid = self.control.cdrid
            doc = Doc(self.control.session, id=cdrid)
            # NOTE(review): the document is checked out before the checks
            # below, so a failure here may leave it locked - confirm
            # whether that needs cleanup.
            doc.check_out()
            root = doc.root
            old_node = root.find("PubmedArticle")
            if old_node is None:
                raise Exception(f"{cdrid} is not a PubMed article")
            root.replace(old_node, self.pubmed_article)
            doc.xml = etree.tostring(root)

        # Otherwise, build up a new document and insert NLM's info.
        else:
            pmid = self.control.pmid
            cdrid = self.lookup(pmid)
            if cdrid:
                raise Exception(f"PMID {pmid} already imported as {cdrid}")
            root = etree.Element("Citation")
            details = etree.SubElement(root, "VerificationDetails")
            etree.SubElement(details, "Verified").text = "Yes"
            etree.SubElement(details, "VerifiedIn").text = "PubMed"
            root.append(self.pubmed_article)
            opts = dict(xml=etree.tostring(root), doctype="Citation")
            doc = Doc(self.control.session, **opts)

        # In either case, return the Doc object.
        return doc

    @cached_property
    def pubmed_article(self):
        """Fetch and prepare PubmedArticle element for import into the CDR.

        Note that we no longer import everything in the documents we get
        from NLM, but instead cherry-pick just the information we need,
        in order to avoid the whiplash of keeping up with all of their
        DTD changes.

        If the first response has no PubmedArticle element, the request
        is retried once after a short pause.

        Raise:
            Exception - if neither response contains a PubmedArticle
        """

        pmid = self.control.pmid
        url = f"{self.EFETCH}{pmid}"
        self.control.session.logger.info("Fetching %r", url)
        node = self._fetch_article_node(url)
        if node is None:
            # Wait briefly, then try once more.
            sleep(self.RETRY_DELAY)
            node = self._fetch_article_node(url)
        if node is None:
            raise Exception(f"PubmedArticle for {pmid} not found")
        return prepare_pubmed_article_for_import(node)

    def _fetch_article_node(self, url):
        """Request `url` and pull out the PubmedArticle element.

        Pass:
            url - efetch address for the article

        Return:
            the PubmedArticle child of the response's root element, or
            None (after logging the response) if there isn't one
        """

        response = requests.get(url, timeout=self.FETCH_TIMEOUT)
        root = etree.fromstring(response.content)
        node = root.find("PubmedArticle")
        if node is None:
            args = url, response.content
            self.control.session.logger.error("url=%s response=%s", *args)
        return node

    def lookup(self, pmid):
        """See if we have already imported this article.

        Pass:
            pmid - unique string identifier for the PubMed record

        Return:
            ten-digit, zero-padded string for the ID of the first
            matching Citation document (or None if we don't already
            have it)
        """

        query = self.control.DBQuery("query_term", "doc_id")
        query.where("path LIKE '/Citation/PubmedArticle/%/PMID'")
        query.where(query.Condition("value", pmid))
        rows = query.execute(self.control.session.cursor).fetchall()
        # NOTE(review): no "CDR" prefix is added here - confirm whether
        # the error message built from this value should carry one.
        return f"{rows[0].doc_id:010d}" if rows else None
if __name__ == "__main__":
    # Handle the request when this file is executed as a script.
    search = CitationSearch()
    search.run()