Skip to content

Commit

Permalink
Merge pull request #1416 from kkaris/journal-info
Browse files Browse the repository at this point in the history
Journal info
  • Loading branch information
bgyori authored Aug 30, 2023
2 parents 195dc43 + 7877aa1 commit 120e861
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 19 deletions.
2 changes: 1 addition & 1 deletion indra/databases/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
'SDIS', 'SCHEM', 'SFAM', 'SCOMP', 'HMS-LINCS', 'NXPFA',
'OMIM', 'LSPCI', 'UPLOC', 'BFO', 'CCLE', 'CLO', 'GENBANK', 'CALOHA',
'DRUGBANK.SALT', 'SMILES', 'NIHREPORTER.PROJECT', 'GOOGLE.PATENT', 'SPINE',
'VO', 'EMAPA', 'INO', 'CIDO', 'OAE', 'OHPI', 'PHIPO'
'VO', 'EMAPA', 'INO', 'CIDO', 'OAE', 'OHPI', 'PHIPO', 'NLM', 'ISNI',
}

# These are reverse mappings from identifiers.org namespaces to INDRA
Expand Down
159 changes: 141 additions & 18 deletions indra/literature/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,37 +307,157 @@ def _find_elem_text(root, xpath_string):
return None if elem is None else elem.text


def _get_journal_info(medline_citation, get_issns_from_nlm):
def _get_issue_info(journal: ET.Element):
    """Extract volume, issue number and publication year of a journal issue.

    Parameters
    ----------
    journal : xml.etree.ElementTree.Element
        The Journal element of a PubMed article record.

    Returns
    -------
    dict
        A dictionary with the keys "volume" and "issue" (the text of the
        Volume and Issue children of JournalIssue, or None if absent) and
        "year" (the PubDate/Year value as an int, or None if it is not
        available).
    """
    issue = journal.find('JournalIssue')
    if issue is None:
        # Nothing to look up; calling find() on None below would raise an
        # AttributeError instead of reporting the values as missing.
        return {"volume": None, "issue": None, "year": None}

    # PubDate can be absent or lack a Year child; the year is None then.
    pub_date = issue.find('PubDate')
    year_text = None
    if pub_date is not None:
        year_text = _find_elem_text(pub_date, 'Year')

    return {
        "volume": _find_elem_text(issue, 'Volume'),
        "issue": _find_elem_text(issue, 'Issue'),
        "year": int(year_text) if year_text else None,
    }


def get_issn_info(
    medline_citation: ET.Element,
    get_issns_from_nlm: str = "never"
):
    """Given a medline citation, get the journal, issue and ISSN info.

    Parameters
    ----------
    medline_citation : xml.etree.ElementTree.Element
        The MedlineCitation element of the PubMed XML tree.
    get_issns_from_nlm : Literal['never', 'missing', 'always']
        Whether to recover ISSN values from the NLM catalog. If 'missing',
        the NLM catalog is queried only when neither an ISSN nor a linking
        ISSN value is found in the XML. If 'always', the NLM catalog is
        queried regardless of what is found in the XML. Default is 'never'
        (i.e., the NLM catalog is not queried at all).

    Returns
    -------
    dict
        A dictionary of journal, issue, and ISSN info. The structure is as
        follows:

        {
            "journal_title": str,
            "journal_abbrev": str,
            "journal_nlm_id": str,
            "issn_dict": {
                "issn": str,
                "issn_l": str,
                "type": str,
                "alternate_issns": ...,
            },
            "issue_dict": {
                "volume": str,
                "issue": str,
                "year": int
            }
        }

        The keys of "issn_dict" are only present when the corresponding
        information is available: "issn" and "type" when the Journal has an
        ISSN element ("type" is the lower-cased IssnType attribute, or
        "other" if the attribute is absent), "issn_l" when a linking ISSN
        is given, and "alternate_issns" when the NLM catalog was queried
        and get_issns_for_journal returned a non-empty result.

    Raises
    ------
    ValueError
        If get_issns_from_nlm is not one of 'never', 'missing', 'always'.
    """
    if get_issns_from_nlm not in ('never', 'missing', 'always'):
        raise ValueError("get_issns_from_nlm must be one of 'never', "
                         "'missing', or 'always'")

    # Journal info
    journal = medline_citation.find('Article/Journal')
    journal_title = _find_elem_text(journal, 'Title')
    journal_abbrev = _find_elem_text(journal, 'ISOAbbreviation')

    # Issue info
    issue_info = _get_issue_info(journal)

    # Get the ISSN from the article record
    issn_dict = {}
    issn_element = journal.find("ISSN")
    if issn_element is not None:
        issn_dict["issn"] = issn_element.text
        issn_dict["type"] = \
            issn_element.attrib.get("IssnType", "other").lower()

    # Get the linking ISSN from the article record
    issn_linking = _find_elem_text(medline_citation,
                                   "MedlineJournalInfo/ISSNLinking")
    if issn_linking:
        issn_dict["issn_l"] = issn_linking

    nlm_id = _find_elem_text(medline_citation,
                             'MedlineJournalInfo/NlmUniqueID')

    # Only actual ISSN values count as "found". The "type" entry is always
    # a non-empty string, so testing all dict values would treat an ISSN
    # element without text as a found ISSN and skip the 'missing' lookup.
    has_issn = bool(issn_dict.get("issn") or issn_dict.get("issn_l"))

    # Get ISSN values from the NLM catalog
    if nlm_id and (
        get_issns_from_nlm == 'always' or
        (get_issns_from_nlm == 'missing' and not has_issn)
    ):
        nlm_issn_list = get_issns_for_journal(nlm_id)
        if nlm_issn_list:
            issn_dict['alternate_issns'] = nlm_issn_list

    return {
        "journal_title": journal_title,
        "journal_abbrev": journal_abbrev,
        "journal_nlm_id": nlm_id,
        "issn_dict": issn_dict,
        "issue_dict": issue_info,
    }


def _get_journal_info(medline_citation, get_issns_from_nlm: bool):
    """Collect journal, ISSN and issue metadata from a MedlineCitation.

    Parameters
    ----------
    medline_citation : xml.etree.ElementTree.Element
        The MedlineCitation element of the PubMed XML tree.
    get_issns_from_nlm : bool
        If True and the record has an NLM unique ID, the ISSNs returned by
        get_issns_for_journal for that ID are added to the ISSN list.

    Returns
    -------
    dict
        A dictionary with the keys 'journal_title', 'journal_abbrev',
        'issn_list' (de-duplicated ISSN strings, in no particular order),
        'issn_l' (the linking ISSN, or None), 'journal_nlm_id', 'issue',
        'volume' and 'year'.
    """
    # Journal info
    journal = medline_citation.find('Article/Journal')
    journal_title = _find_elem_text(journal, 'Title')
    journal_abbrev = _find_elem_text(journal, 'ISOAbbreviation')

    # Issue info
    issue_info = _get_issue_info(journal)

    # A set is the single collection of ISSNs so duplicates between the
    # article record and the NLM catalog collapse automatically.
    issn_set = set()

    # Add the ISSN from the article record
    issn = _find_elem_text(journal, 'ISSN')
    if issn:
        issn_set.add(issn)

    # Add the Linking ISSN from the article record
    issn_linking = _find_elem_text(medline_citation,
                                   'MedlineJournalInfo/ISSNLinking')
    if issn_linking:
        issn_set.add(issn_linking)

    # Now get the list of ISSNs from the NLM Catalog
    nlm_id = _find_elem_text(medline_citation,
                             'MedlineJournalInfo/NlmUniqueID')
    if nlm_id and get_issns_from_nlm:
        nlm_issn_list = get_issns_for_journal(nlm_id)
        if nlm_issn_list:
            # Entries are (type, value) pairs; keep only the non-empty
            # values so that 'issn_list' stays a list of ISSN strings.
            issn_set.update(v for _, v in nlm_issn_list if v)

    return {
        'journal_title': journal_title,
        'journal_abbrev': journal_abbrev,
        'issn_list': list(issn_set),
        'issn_l': issn_linking,
        'journal_nlm_id': nlm_id,
        'issue': issue_info['issue'],
        'volume': issue_info['volume'],
        'year': issue_info['year'],
    }


def _get_pubmed_publication_date(pubmed_data):
Expand Down Expand Up @@ -498,12 +618,13 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
dict of dicts
Dictionary indexed by PMID. Each value is a dict containing the
following fields: 'doi', 'title', 'authors', 'journal_title',
'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page'.
'journal_abbrev', 'journal_nlm_id', 'issn_list', 'page',
'volume', 'issue', 'issue_pub_date'.
"""
# Iterate over the articles and build the results dict
results = {}
pm_articles = tree.findall('./PubmedArticle')
for art_ix, pm_article in enumerate(pm_articles):
for pm_article in pm_articles:
medline_citation = pm_article.find('./MedlineCitation')
pubmed_data = pm_article.find('PubmedData')

Expand Down Expand Up @@ -651,7 +772,7 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,

@lru_cache(maxsize=1000)
def get_issns_for_journal(nlm_id):
"""Get a list of the ISSN numbers for a journal given its NLM ID.
"""Get a dict of the ISSN numbers for a journal given its NLM ID.
Information on NLM XML DTDs is available at
https://www.nlm.nih.gov/databases/dtd/
Expand All @@ -662,14 +783,16 @@ def get_issns_for_journal(nlm_id):
tree = send_request(pubmed_fetch, params)
if tree is None:
return None
issn_list = tree.findall('.//ISSN')
issn_linking = tree.findall('.//ISSNLinking')
issns = issn_list + issn_linking
issn_list = [(e.attrib.get("IssnType", "other").lower(), e.text)
for e in tree.findall('.//ISSN')]
issn_linking = tree.find('.//ISSNLinking')
if issn_linking:
issn_list.append(("linking", issn_linking.text))

# No ISSNs found!
if not issns:
if not any(v for k, v in issn_list):
return None
else:
return [issn.text for issn in issns]
return issn_list


def expand_pagination(pages):
Expand Down

0 comments on commit 120e861

Please sign in to comment.