import requests
import pandas as pd
from typing import List, Optional, Tuple
class PubMedFetcher:
    """Fetch papers from PubMed via the NCBI E-utilities REST API.

    The search query is fixed at construction time. ``fetch_papers`` runs an
    ``esearch`` request to get matching PubMed IDs; ``fetch_details`` runs an
    ``esummary`` request for those IDs.
    """

    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

    def __init__(self, query: str):
        # Query string in PubMed search syntax.
        self.query = query

    def fetch_papers(self) -> List[dict]:
        """Run an esearch request and return matching PubMed IDs.

        Returns:
            A list of dicts of the form ``{"PubmedID": "<id>"}``.
        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        url = f"{self.BASE_URL}esearch.fcgi"
        params = {
            "db": "pubmed",
            "term": self.query,
            "retmode": "xml",
            "retmax": 100,
        }
        # timeout added: requests has no default timeout and would hang forever
        # on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return self.parse_paper_ids(response.text)

    def parse_paper_ids(self, xml_data: str) -> List[dict]:
        """Parse an esearch XML response into ``[{"PubmedID": id}, ...]``.

        esearch responses carry the IDs as ``<IdList><Id>...</Id></IdList>``.
        """
        import xml.etree.ElementTree as ET  # stdlib; local to keep module imports unchanged

        root = ET.fromstring(xml_data)
        return [{"PubmedID": el.text or ""} for el in root.findall(".//IdList/Id")]

    def fetch_details(self, paper_ids: List[str]) -> List[dict]:
        """Fetch esummary details for the given PubMed IDs.

        Returns an empty list immediately for an empty ID list instead of
        issuing a request with an empty ``id=`` parameter.
        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        if not paper_ids:
            return []
        url = f"{self.BASE_URL}esummary.fcgi"
        params = {
            "db": "pubmed",
            "id": ",".join(paper_ids),
            "retmode": "xml",
        }
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        return self.parse_details(response.text)

    def parse_details(self, xml_data: str) -> List[dict]:
        """Parse an esummary XML response into per-paper detail dicts.

        esummary responses contain one ``<DocSum>`` per paper, with an ``<Id>``
        child and ``<Item Name="...">`` children for the metadata fields.
        """
        import xml.etree.ElementTree as ET

        root = ET.fromstring(xml_data)
        papers: List[dict] = []
        for docsum in root.findall(".//DocSum"):
            title = ""
            pub_date = ""
            authors: List[str] = []
            for item in docsum.findall("Item"):
                name = item.get("Name")
                if name == "Title":
                    title = item.text or ""
                elif name == "PubDate":
                    pub_date = item.text or ""
                elif name == "AuthorList":
                    authors = [a.text or "" for a in item.findall("Item")]
            papers.append({
                "PubmedID": docsum.findtext("Id", default=""),
                "Title": title,
                "PubDate": pub_date,
                "Authors": authors,
                # esummary does not expose author emails; kept for key
                # compatibility with existing callers.
                "CorrespondingAuthorEmail": "",
            })
        return papers

    def filter_non_academic_authors(self, authors: List[str]) -> Tuple[List[str], List[str]]:
        """Split author/affiliation strings into non-academic and company lists.

        Heuristic, case-insensitive keyword matching:
          * non-academic: string contains neither "university" nor "lab";
          * company: string contains "pharma" or "biotech".
        Returns:
            ``(non_academic_authors, company_affiliations)``
        """
        non_academic_authors: List[str] = []
        company_affiliations: List[str] = []
        for author in authors:
            lowered = author.lower()  # hoisted: avoid repeated .lower() calls
            if "university" not in lowered and "lab" not in lowered:
                non_academic_authors.append(author)
            if "pharma" in lowered or "biotech" in lowered:
                company_affiliations.append(author)
        return non_academic_authors, company_affiliations

    def save_to_csv(self, papers: List[dict], filename: str) -> None:
        """Write the paper dicts to *filename* as CSV (no index column)."""
        pd.DataFrame(papers).to_csv(filename, index=False)
# cli.py
import argparse import logging from pubmed_fetcher import PubMedFetcher
def main() -> None:
    """CLI entry point: fetch PubMed papers for a query, then save or print.

    NOTE(review): the original source was truncated mid-statement here; the
    flow below (fetch IDs -> fetch details -> save to --file or print) is
    reconstructed from the visible argparse setup — confirm against the
    project's intended CLI behavior.
    """
    parser = argparse.ArgumentParser(description="Fetch research papers from PubMed.")
    parser.add_argument("query", type=str, help="Search query for PubMed.")
    parser.add_argument("-f", "--file", type=str, help="Filename to save results.")
    parser.add_argument("-d", "--debug", action="store_true", help="Enable debug output.")
    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    fetcher = PubMedFetcher(args.query)
    try:
        papers = fetcher.fetch_papers()
        details = fetcher.fetch_details([p["PubmedID"] for p in papers])
        if args.file:
            fetcher.save_to_csv(details, args.file)
        else:
            # No output file requested: dump results to stdout.
            for paper in details:
                print(paper)
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        logging.error(f"Failed to fetch papers: {e}")
from typing import List, Dict, Optional
import csv import logging from Bio import Entrez
# Module-wide logging setup: INFO and above.
logging.basicConfig(level=logging.INFO)
# Contact email attached to every Entrez/E-utilities request made below.
# NOTE(review): placeholder address — replace with a real contact email.
Entrez.email = "your-email@example.com"
def fetch_pubmed_papers(query: str, max_results: int = 10) -> List[Dict]:
    """
    Fetches research papers from PubMed based on the given query.

    :param query: Search query in PubMed format.
    :param max_results: Maximum number of results to fetch.
    :return: List of papers with metadata.
    """
    try:
        # Look up the matching PubMed IDs first.
        search_handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        search_record = Entrez.read(search_handle)
        search_handle.close()

        collected = []
        # Fetch and parse each article record individually.
        for pmid in search_record.get("IdList", []):
            fetch_handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="text")
            record = Entrez.read(fetch_handle)
            fetch_handle.close()

            extracted = extract_paper_details(record)
            if extracted:
                collected.append(extracted)
        return collected
    except Exception as e:
        logging.error(f"Error fetching PubMed data: {e}")
        return []
def extract_paper_details(details) -> Optional[Dict]:
    """Extracts relevant information from a PubMed efetch record.

    :param details: Parsed Entrez record (dict-like, ``PubmedArticle`` root).
    :return: Dict of paper fields, or None if the record cannot be parsed
             (the error is logged).
    """
    try:
        citation = details["PubmedArticle"][0]["MedlineCitation"]
        article = citation["Article"]
        authors = article.get("AuthorList", [])

        non_academic_authors = []
        company_affiliations = []
        corresponding_email = None

        for author in authors:
            if "AffiliationInfo" in author:
                affiliation = author["AffiliationInfo"][0].get("Affiliation", "")
                if is_non_academic(affiliation):
                    # Join only the name parts actually present, so a missing
                    # ForeName/LastName no longer yields a stray space.
                    name_parts = (author.get("ForeName", ""), author.get("LastName", ""))
                    non_academic_authors.append(" ".join(p for p in name_parts if p))
                    company_affiliations.append(affiliation)
            # NOTE(review): PubMed usually embeds the email inside the
            # Affiliation text; an "ElectronicAddress" key may never appear in
            # Entrez author records — confirm against real responses.
            if "ElectronicAddress" in author:
                corresponding_email = author["ElectronicAddress"]

        # Bug fix: ArticleDate is frequently present but EMPTY; the old
        # `article.get("ArticleDate", [{}])[0]` raised IndexError for those
        # records and the broad except silently dropped the whole paper.
        article_dates = article.get("ArticleDate") or [{}]

        return {
            "PubmedID": citation["PMID"],
            "Title": article.get("ArticleTitle", "Unknown"),
            "Publication Date": article_dates[0].get("Year", "Unknown"),
            "Non-academic Author(s)": ", ".join(non_academic_authors),
            "Company Affiliation(s)": ", ".join(company_affiliations),
            "Corresponding Author Email": corresponding_email or "N/A",
        }
    except Exception as e:
        # Boundary handler: a malformed record should not abort the batch.
        logging.error(f"Error parsing paper details: {e}")
        return None


def is_non_academic(affiliation: str) -> bool:
    """Determines if an affiliation is non-academic.

    Case-sensitive keyword screen (matches the original behavior): any listed
    keyword in the affiliation marks it academic.
    """
    academic_keywords = ["University", "Institute", "College", "School", "Hospital", "Lab"]
    return not any(word in affiliation for word in academic_keywords)
def save_to_csv(papers: List[Dict], filename: str) -> None:
    """Saves research paper data to a CSV file.

    :param papers: Rows as dicts keyed exactly by the column names below
                   (extra keys would make ``DictWriter`` raise ValueError).
    :param filename: Destination path; overwritten if it already exists.
    """
    fieldnames = [
        "PubmedID",
        "Title",
        "Publication Date",
        "Non-academic Author(s)",
        "Company Affiliation(s)",
        "Corresponding Author Email",
    ]
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(papers)
    # Bug fix: the old message logged the literal text "(unknown)" instead of
    # the actual destination path.
    logging.info(f"Saved results to {filename}")