Add code to generate retractions resource file
kkaris committed Dec 14, 2023
1 parent 0440f23 commit 5427359
Showing 1 changed file with 134 additions and 1 deletion.
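For orientation, here is a minimal usage sketch of the new entry point added by this commit (not part of the diff below; the directory path is illustrative):

from indra.literature.pubmed_client import generate_retractions_file

# ensure_xml_files() is called internally, so any missing pubmed*.xml.gz
# baseline/update files are downloaded into this directory first; the files
# are then scanned for articles with publication type 'Retraction' and the
# collected PMIDs are written to the pmid_retractions.tsv resource.
generate_retractions_file("/data/pubmed_xml")  # illustrative path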
135 changes: 134 additions & 1 deletion indra/literature/pubmed_client.py
@@ -1,7 +1,13 @@
"""
Search and get metadata for articles in Pubmed.
"""
import glob
import gzip
import os
import re
import time
from pathlib import Path

import tqdm
import logging
import random
@@ -11,6 +17,10 @@
from typing import List
from functools import lru_cache
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup

from indra.resources import RESOURCES_PATH
from indra.util import UnicodeXMLTreeBuilder as UTB
from indra.util import batch_iter, pretty_save_xml

@@ -19,6 +29,10 @@

pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
pubmed_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
pubmed_archive = "https://ftp.ncbi.nlm.nih.gov/pubmed"
pubmed_archive_baseline = pubmed_archive + "/baseline/"
pubmed_archive_update = pubmed_archive + "/updatefiles/"
RETRACTIONS_FILE = RESOURCES_PATH + "/pmid_retractions.tsv"


# Send request can't be cached by lru_cache because it takes a dict
@@ -150,7 +164,7 @@ def get_ids_for_gene(hgnc_name, **kwargs):
"""Get the curated set of articles for a gene in the Entrez database.
Search parameters for the Gene database query can be passed in as
keyword arguments.
keyword arguments.
Parameters
----------
@@ -952,3 +966,122 @@ def get_publication_types(article: ET.Element):
A set of publication types
"""
return {pt.text for pt in article.find('.//PublicationTypeList')}


def generate_retractions_file(xml_path: str):
"""Generate a CSV file of retracted papers from the PubMed XML.
Parameters
----------
xml_path :
Path to the directory holding the PubMed XML files. The files will
be globbed from this directory using the pattern 'pubmed*.xml.gz'.
"""
ensure_xml_files(xml_path)
retractions = set()
for xml_file in tqdm.tqdm(
glob.glob(os.path.join(xml_path, 'pubmed*.xml.gz')),
desc="Processing PubMed XML files"
):
xml_str = gzip.open(xml_file).read()
tree = ET.XML(xml_str, parser=UTB())
for article in tqdm.tqdm(
tree.findall('.//PubmedArticle'), unit_scale=True, unit='article'
):
pub_types = get_publication_types(article)
if 'Retraction' in pub_types:
pmid = int(article.find('.//PMID').text)
retractions.add(pmid)

logger.info(f"Writing {len(retractions)} retractions to {RETRACTIONS_FILE}")
with gzip.open(RETRACTIONS_FILE, 'wt') as fh:
fh.writelines(f'{pmid}\n' for pmid in retractions)


def ensure_xml_files(xml_path: str, retries: int = 3):
"""Ensure that the XML files are downloaded and up to date.
Parameters
----------
xml_path :
Path to the directory holding the PubMed XML files. The files will
be globbed from this directory using the pattern 'pubmed*.xml.gz'.
retries :
Number of times to retry downloading an individual XML file if there
is an HTTP error. Default: 3.
"""
xml_path = Path(xml_path)
xml_path.mkdir(parents=True, exist_ok=True)

basefiles = [u for u in _get_urls(pubmed_archive_baseline)]
updatefiles = [u for u in _get_urls(pubmed_archive_update)]

# Download any pubmed*.xml.gz files that are not already present locally
for xml_url in tqdm.tqdm(
basefiles + updatefiles, desc="Downloading PubMed XML files"
):
xml_file_path = xml_path.joinpath(xml_url.split("/")[-1])
if not xml_file_path.exists():
success = _download_xml_gz(xml_url, xml_file_path, retries=retries)
if not success:
tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")


def _get_urls(url: str):
"""Get the paths to all XML files on the PubMed FTP server."""
logger.info("Getting URL paths from %s" % url)

# Get page
response = requests.get(url)
response.raise_for_status()

# Make soup
# Todo: see if it's possible to get the lists of files directly from the
# FTP server, rather than scraping the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Append trailing slash if not present
url = url if url.endswith("/") else url + "/"

# Loop over all links
for link in soup.find_all("a"):
href = link.get("href")
# Yield hrefs of the form 'pubmed<2-digit year>n<4-digit index>.xml.gz';
# the accompanying .md5 files are excluded by the .xml.gz suffix check
if href and href.startswith("pubmed") and href.endswith(".xml.gz"):
yield url + href


def _download_xml_gz(xml_url: str, xml_file: Path, md5_check: bool = True,
retries: int = 3) -> bool:
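"""Download a single PubMed XML .gz file, optionally verifying its md5 checksum.

Returns True if the file was downloaded successfully and False otherwise.
"""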
try:
resp = requests.get(xml_url)
resp.raise_for_status()
except requests.exceptions.RequestException as e:
if retries > 0:
tqdm.tqdm.write(f"Error downloading {xml_url}, retrying." + str(e))
sleep(1)
return _download_xml_gz(xml_url, xml_file, md5_check, retries - 1)
else:
tqdm.tqdm.write(f"Error downloading {xml_url}, skipping")
return False

if md5_check:
from hashlib import md5
md5_resp = requests.get(xml_url + ".md5")
checksum = md5(resp.content).hexdigest()
expected_checksum = re.search(
r"[0-9a-z]+(?=\n)", md5_resp.content.decode("utf-8")
).group()
if checksum != expected_checksum:
logger.warning(
f"Checksum mismatch for {xml_url}, skipping download"
)
return False

# Write the xml.gz file to disk
with xml_file.open("wb") as fh:
fh.write(resp.content)

return True
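
A short sketch (also not part of the commit) of reading the generated resource back; generate_retractions_file writes it gzip-compressed with one PMID per line, so it can be loaded as follows:

import gzip

from indra.literature.pubmed_client import RETRACTIONS_FILE

# The resource is written via gzip.open despite its .tsv name, so read it
# back the same way; each non-empty line holds one retraction PMID.
with gzip.open(RETRACTIONS_FILE, "rt") as fh:
    retracted_pmids = {int(line.strip()) for line in fh if line.strip()}

print(f"{len(retracted_pmids)} retraction PMIDs loaded")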
