Add code to generate retractions resource file
kkaris committed Dec 14, 2023
1 parent 0440f23 commit 5427359
Showing 1 changed file with 134 additions and 1 deletion.
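For orientation, here is a minimal usage sketch of the new entry point added by this commit (not part of the diff below; the directory path is illustrative):

from indra.literature.pubmed_client import generate_retractions_file

# ensure_xml_files() is called internally, so any missing pubmed*.xml.gz
# baseline/update files are downloaded into this directory first; the files
# are then scanned for articles with publication type 'Retraction' and the
# collected PMIDs are written to the pmid_retractions.tsv resource.
generate_retractions_file("/data/pubmed_xml")  # illustrative path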
135 changes: 134 additions & 1 deletion indra/literature/pubmed_client.py
@@ -1,7 +1,13 @@
"""
Search and get metadata for articles in Pubmed.
"""
import glob
import gzip
import os
import re
import time
from pathlib import Path

import tqdm
import logging
import random
@@ -11,6 +17,10 @@
from typing import List
from functools import lru_cache
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup

from indra.resources import RESOURCES_PATH
from indra.util import UnicodeXMLTreeBuilder as UTB
from indra.util import batch_iter, pretty_save_xml

@@ -19,6 +29,10 @@

pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
pubmed_archive = "https://ftp.ncbi.nlm.nih.gov/pubmed"
pubmed_archive_baseline = pubmed_archive + "/baseline/"
pubmed_archive_update = pubmed_archive + "/updatefiles/"
RETRACTIONS_FILE = RESOURCES_PATH + "/pmid_retractions.tsv"


# Send request can't be cached by lru_cache because it takes a dict
@@ -150,7 +164,7 @@ def get_ids_for_gene(hgnc_name, **kwargs):
"""Get the curated set of articles for a gene in the Entrez database.
Search parameters for the Gene database query can be passed in as
keyword arguments.
keyword arguments.
Parameters
----------
@@ -952,3 +966,122 @@ def get_publication_types(article: ET.Element):
A set of publication types
"""
return {pt.text for pt in article.find('.//PublicationTypeList')}


def generate_retractions_file(xml_path: str):
"""Generate a CSV file of retracted papers from the PubMed XML.
Parameters
----------
xml_path :
Path to the directory holding the PubMed XML files. The files will
be globbed from this directory using the pattern 'pubmed*.xml.gz'.
"""
ensure_xml_files(xml_path)
retractions = set()
for xml_file in tqdm.tqdm(
glob.glob(os.path.join(xml_path, 'pubmed*.xml.gz')),
desc="Processing PubMed XML files"
):
xml_str = gzip.open(xml_file).read()
tree = ET.XML(xml_str, parser=UTB())
for article in tqdm.tqdm(
tree.findall('.//PubmedArticle'), unit_scale=True, unit='article'
):
pub_types = get_publication_types(article)
if 'Retraction' in pub_types:
pmid = int(article.find('.//PMID').text)
retractions.add(pmid)

logger.info(f"Writing {len(retractions)} retractions to {RETRACTIONS_FILE}")
with gzip.open(RETRACTIONS_FILE, 'wt') as fh:
fh.writelines(f'{pmid}\n' for pmid in retractions)


def ensure_xml_files(xml_path: str, retries: int = 3):
"""Ensure that the XML files are downloaded and up to date.
Parameters
----------
xml_path :
Path to the directory holding the PubMed XML files. The files will
be globbed from this directory using the pattern 'pubmed*.xml.gz'.
retries :
Number of times to retry downloading an individual XML file if there
is an HTTP error. Default: 3.
"""
xml_path = Path(xml_path)
xml_path.mkdir(parents=True, exist_ok=True)

basefiles = [u for u in _get_urls(pubmed_archive_baseline)]
updatefiles = [u for u in _get_urls(pubmed_archive_update)]

# Download any pubmed*.xml.gz files that are not already present locally
for xml_url in tqdm.tqdm(
basefiles + updatefiles, desc="Downloading PubMed XML files"
):
xml_file_path = xml_path.joinpath(xml_url.split("/")[-1])
if not xml_file_path.exists():
success = _download_xml_gz(xml_url, xml_file_path, retries=retries)
if not success:
tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")


def _get_urls(url: str):
"""Get the paths to all XML files on the PubMed FTP server."""
logger.info("Getting URL paths from %s" % url)

# Get page
response = requests.get(url)
response.raise_for_status()

# Make soup
# Todo: see if it's possible to get the lists of files directly from the
# FTP server, rather than scraping the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Append trailing slash if not present
url = url if url.endswith("/") else url + "/"

# Loop over all links
for link in soup.find_all("a"):
href = link.get("href")
# Yield hrefs of the form 'pubmed<2-digit year>n<4-digit index>.xml.gz';
# the accompanying .md5 files are excluded by the .xml.gz suffix check
if href and href.startswith("pubmed") and href.endswith(".xml.gz"):
yield url + href


def _download_xml_gz(xml_url: str, xml_file: Path, md5_check: bool = True,
retries: int = 3) -> bool:
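"""Download a single PubMed XML .gz file, optionally verifying its md5 checksum.

Returns True if the file was downloaded successfully and False otherwise.
"""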
try:
resp = requests.get(xml_url)
resp.raise_for_status()
except requests.exceptions.RequestException as e:
if retries > 0:
tqdm.tqdm.write(f"Error downloading {xml_url}, retrying." + str(e))
sleep(1)
return _download_xml_gz(xml_url, xml_file, md5_check, retries - 1)
else:
tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")
return False

if md5_check:
from hashlib import md5
md5_resp = requests.get(xml_url + ".md5")
checksum = md5(resp.content).hexdigest()
expected_checksum = re.search(
r"[0-9a-z]+(?=\n)", md5_resp.content.decode("utf-8")
).group()
if checksum != expected_checksum:
logger.warning(
f"Checksum mismatch for {xml_url}, skipping download"
)
return False

# Write the xml.gz file to disk
with xml_file.open("wb") as fh:
fh.write(resp.content)

return True
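
A short sketch (also not part of the commit) of reading the generated resource back; generate_retractions_file writes it gzip-compressed with one PMID per line, so it can be loaded as follows:

import gzip

from indra.literature.pubmed_client import RETRACTIONS_FILE

# The resource is written via gzip.open despite its .tsv name, so read it
# back the same way; each non-empty line holds one retraction PMID.
with gzip.open(RETRACTIONS_FILE, "rt") as fh:
    retracted_pmids = {int(line.strip()) for line in fh if line.strip()}

print(f"{len(retracted_pmids)} retraction PMIDs loaded")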
