Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.1] - 2026-02-16

### Changed
- arXiv categories are now batched into a single OR query (`cat:cs.AI OR cat:cs.CL`), reducing N parallel API calls to 1
- `page_size` now matches `min(max_results, 100)` instead of always requesting 100 results
- `ThreadPoolExecutor` removed from `fetch_recent_papers()` since only one API call is made
- `--force-refresh` now fetches today's papers (1-day window) instead of a full 7-day backfill; the 7-day bootstrap is reserved for first runs only
- Quick Start no longer recommends `--force-refresh` for the initial run since `paperweight run` already backfills automatically

### Added
- RSS feed fetcher (`fetch_rss_papers`) for daily lookups — no rate limits, sub-second metadata fetch
- RSS-first routing in `fetch_recent_papers`: daily runs try RSS before falling back to the arXiv API
- Exponential backoff (via `tenacity`) on `arxiv.HTTPError` with waits of 5 → 15 → 45 → 90 s
- `ArxivRateLimitError` exception with user-friendly message for HTTP 429 responses

## [0.3.0] - 2026-02-15

### Added
Expand Down Expand Up @@ -87,7 +102,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Email notification system
- YAML-based configuration

[Unreleased]: https://github.com/seanbrar/paperweight/compare/v0.3.0...HEAD
[Unreleased]: https://github.com/seanbrar/paperweight/compare/v0.3.1...HEAD
[0.3.1]: https://github.com/seanbrar/paperweight/compare/v0.3.0...v0.3.1
[0.3.0]: https://github.com/seanbrar/paperweight/compare/v0.2.0...v0.3.0
[0.2.0]: https://github.com/seanbrar/paperweight/compare/v0.1.2...v0.2.0
[0.1.2]: https://github.com/seanbrar/paperweight/compare/v0.1.1...v0.1.2
Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,15 @@ source .venv/bin/activate
## Quick start (works without API keys)

```bash
paperweight init # create config.yaml with safe defaults
paperweight doctor # check your setup for issues
paperweight run --force-refresh # fetch papers and produce a digest
paperweight init # create config.yaml with safe defaults
paperweight doctor # check your setup for issues
paperweight run # fetch papers and produce a digest
```

The first run automatically backfills a week of papers. After that, the same
`paperweight run` fetches only what's new. Use `--force-refresh` to re-fetch
today's papers (a 1-day window, not the full backfill) if you've already run today.

Notes:

- Default analyzer mode is `abstract` (no API key required).
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "academic-paperweight"
version = "0.3.0"
version = "0.3.1"
description = "Automated retrieval, filtering, and LLM-powered summarization of arXiv papers based on your research interests."
readme = "README.md"
requires-python = ">=3.11, <3.14"
Expand Down
82 changes: 41 additions & 41 deletions src/mocks/local_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@


def mock_fetch_paper_content(
paper_id: str,
files_dir: Path = DEFAULT_FILES_DIR
paper_id: str, files_dir: Path = DEFAULT_FILES_DIR
) -> Tuple[Optional[bytes], Optional[str]]:
"""Mock replacement for paperweight.scraper.fetch_paper_content.

Expand All @@ -37,7 +36,7 @@ def mock_fetch_paper_content(
or (None, None) if no file found.
"""
# Normalize paper_id - strip version if present for base lookup
base_id = paper_id.split('v')[0] if 'v' in paper_id else paper_id
base_id = paper_id.split("v")[0] if "v" in paper_id else paper_id

# Try different ID patterns (with/without version)
id_patterns = [paper_id]
Expand All @@ -48,7 +47,7 @@ def mock_fetch_paper_content(
if paper_id == base_id:
# Look for any versioned file
for f in files_dir.glob(f"{base_id}v*.tar.gz"):
id_patterns.insert(0, f.stem.replace('.tar', ''))
id_patterns.insert(0, f.stem.replace(".tar", ""))
break
for f in files_dir.glob(f"{base_id}v*.pdf"):
if f.stem not in id_patterns:
Expand All @@ -70,17 +69,17 @@ def mock_fetch_paper_content(


def mock_fetch_arxiv_papers(
category: str,
categories: List[str],
start_date: Any,
max_results: Optional[int] = None,
db_path: Path = DEFAULT_DB_PATH
db_path: Path = DEFAULT_DB_PATH,
) -> List[Dict[str, Any]]:
"""Mock replacement for paperweight.scraper.fetch_arxiv_papers.

Reads paper metadata from local SQLite database instead of arXiv API.

Args:
category: The arXiv category to filter by (e.g., "cs.AI")
categories: arXiv categories to filter by (e.g., ``['cs.AI', 'cs.CL']``)
start_date: Not used in mock (we return all matching papers)
max_results: Maximum number of results to return
db_path: Path to the SQLite database
Expand All @@ -95,8 +94,10 @@ def mock_fetch_arxiv_papers(
conn.row_factory = sqlite3.Row
cursor = conn.cursor()

sql = "SELECT * FROM papers WHERE categories LIKE ?"
params: List[Any] = [f"%{category}%"]
# Build category filter with OR logic
cat_conditions = " OR ".join(["categories LIKE ?" for _ in categories])
sql = f"SELECT * FROM papers WHERE ({cat_conditions})"
params: List[Any] = [f"%{cat}%" for cat in categories]

if max_results:
sql += " LIMIT ?"
Expand All @@ -107,12 +108,14 @@ def mock_fetch_arxiv_papers(

papers = []
for row in rows:
papers.append({
"title": row["title"],
"link": f"http://arxiv.org/abs/{row['id']}",
"date": datetime.fromisoformat(row["published"]).date(),
"abstract": row["abstract"],
})
papers.append(
{
"title": row["title"],
"link": f"http://arxiv.org/abs/{row['id']}",
"date": datetime.fromisoformat(row["published"]).date(),
"abstract": row["abstract"],
}
)

conn.close()
return papers
Expand All @@ -132,18 +135,17 @@ def patch_scraper_for_local_mirror(monkeypatch, files_dir: Path = DEFAULT_FILES_
def patched_scraper(monkeypatch):
patch_scraper_for_local_mirror(monkeypatch)
"""

def local_fetch_paper_content(paper_id):
return mock_fetch_paper_content(paper_id, files_dir)

monkeypatch.setattr(
"paperweight.scraper.fetch_paper_content",
local_fetch_paper_content
"paperweight.scraper.fetch_paper_content", local_fetch_paper_content
)

# Also patch the retry-decorated wrapper if needed
monkeypatch.setattr(
"paperweight.scraper.fetch_arxiv_papers",
mock_fetch_arxiv_papers
"paperweight.scraper.fetch_arxiv_papers", mock_fetch_arxiv_papers
)


Expand All @@ -159,7 +161,7 @@ def __init__(
page_size: int = 100,
delay_seconds: float = 3,
num_retries: int = 3,
mirror_path: Path = DEFAULT_MIRROR_PATH
mirror_path: Path = DEFAULT_MIRROR_PATH,
):
self.page_size = page_size
self.delay_seconds = delay_seconds
Expand All @@ -174,17 +176,15 @@ def __init__(
)

def results(
self,
search: arxiv.Search,
offset: int = 0
self, search: arxiv.Search, offset: int = 0
) -> Generator[arxiv.Result, None, None]:
"""Execute search against local SQLite database."""
conn = sqlite3.connect(self.mirror_db_path)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()

query_str = getattr(search, 'query', '')
id_list = getattr(search, 'id_list', [])
query_str = getattr(search, "query", "")
id_list = getattr(search, "id_list", [])

sql = "SELECT * FROM papers WHERE 1=1"
params: List[Any] = []
Expand All @@ -211,7 +211,7 @@ def results(
params.append(f"%{term}%")
params.append(f"%{term}%")

max_results = getattr(search, 'max_results', None)
max_results = getattr(search, "max_results", None)
if max_results:
sql += " LIMIT ?"
params.append(int(max_results))
Expand All @@ -231,29 +231,29 @@ class Author:
def __init__(self, name: str):
self.name = name

authors = [Author(n.strip()) for n in row['authors'].split(',')]
paper_id = row['id']
authors = [Author(n.strip()) for n in row["authors"].split(",")]
paper_id = row["id"]

res = arxiv.Result(
entry_id=f"http://arxiv.org/abs/{paper_id}",
updated=datetime.fromisoformat(row['updated']),
published=datetime.fromisoformat(row['published']),
title=row['title'],
updated=datetime.fromisoformat(row["updated"]),
published=datetime.fromisoformat(row["published"]),
title=row["title"],
authors=authors,
summary=row['abstract'],
summary=row["abstract"],
comment=None,
journal_ref=None,
doi=row['doi'],
primary_category=row['categories'].split(',')[0].strip(),
categories=[cat.strip() for cat in row['categories'].split(',')],
links=[]
doi=row["doi"],
primary_category=row["categories"].split(",")[0].strip(),
categories=[cat.strip() for cat in row["categories"].split(",")],
links=[],
)

# Monkey-patch download methods to use local files
local_pdf_path = row['local_file_path']
local_source_path = row['local_source_path']
local_pdf_path = row["local_file_path"]
local_source_path = row["local_source_path"]

def mock_download_pdf(dirpath: str = './', filename: str = '') -> str:
def mock_download_pdf(dirpath: str = "./", filename: str = "") -> str:
if not filename:
filename = f"{paper_id}.pdf"
target_path = Path(dirpath) / filename
Expand All @@ -263,7 +263,7 @@ def mock_download_pdf(dirpath: str = './', filename: str = '') -> str:
return str(target_path)
raise FileNotFoundError(f"Mock PDF file missing for {paper_id}")

def mock_download_source(dirpath: str = './', filename: str = '') -> str:
def mock_download_source(dirpath: str = "./", filename: str = "") -> str:
if not filename:
filename = f"{paper_id}.tar.gz"
target_path = Path(dirpath) / filename
Expand All @@ -275,6 +275,6 @@ def mock_download_source(dirpath: str = './', filename: str = '') -> str:

res.download_pdf = mock_download_pdf # type: ignore
res.download_source = mock_download_source # type: ignore
res.pdf_url = row['pdf_url']
res.pdf_url = row["pdf_url"]

return res
3 changes: 2 additions & 1 deletion src/paperweight/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
setup_and_get_papers,
summarize_scored_papers,
)
from paperweight.scraper import get_recent_papers # noqa: E402
from paperweight.scraper import ArxivRateLimitError, get_recent_papers # noqa: E402
from paperweight.utils import load_config # noqa: E402

__all__ = [
"__version__",
"ArxivRateLimitError",
"get_recent_papers",
"load_config",
"process_and_summarize_papers",
Expand Down
39 changes: 28 additions & 11 deletions src/paperweight/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,9 @@ def get_abstracts(processed_papers, config, *, summary_concurrency=None):
if analysis_type == "abstract":
return [paper["abstract"] for paper in processed_papers]
if analysis_type == "summary":
return summarize_papers(processed_papers, config, summary_concurrency=summary_concurrency)
return summarize_papers(
processed_papers, config, summary_concurrency=summary_concurrency
)
raise ValueError(f"Unknown analysis type: {analysis_type}")


Expand Down Expand Up @@ -115,9 +117,7 @@ def _resolve_triage_model_config(
analyzer_cfg = full_config.get("analyzer", {})

provider = (
triage_cfg.get("llm_provider")
or analyzer_cfg.get("llm_provider")
or "openai"
triage_cfg.get("llm_provider") or analyzer_cfg.get("llm_provider") or "openai"
).lower()
model = triage_cfg.get("model") or _default_model_for_provider(provider)
api_key = (
Expand Down Expand Up @@ -206,7 +206,9 @@ async def _triage_one_paper_async(prompt, pollux_config, *, min_score):
return _parse_triage_decision(answer, min_score=min_score)


async def _run_triage_async(prompts, pollux_config, *, min_score, concurrency=TRIAGE_CONCURRENCY):
async def _run_triage_async(
prompts, pollux_config, *, min_score, concurrency=TRIAGE_CONCURRENCY
):
"""Run triage prompts concurrently with a semaphore, returning decisions in order."""
semaphore = asyncio.Semaphore(concurrency)
total = len(prompts)
Expand Down Expand Up @@ -277,11 +279,18 @@ def triage_papers(

prompts = [_build_triage_prompt(paper, profile_text) for paper in papers]

triage_concurrency = full_config.get("concurrency", {}).get("triage", TRIAGE_CONCURRENCY)
triage_concurrency = full_config.get("concurrency", {}).get(
"triage", TRIAGE_CONCURRENCY
)

try:
decisions = asyncio.run(
_run_triage_async(prompts, pollux_config, min_score=min_score, concurrency=triage_concurrency)
_run_triage_async(
prompts,
pollux_config,
min_score=min_score,
concurrency=triage_concurrency,
)
)
except Exception as exc:
logger.warning(
Expand Down Expand Up @@ -357,9 +366,13 @@ async def _summarize_one_paper_async(
return str(response)


def _resolve_summary_model_config(config: Dict[str, Any]) -> tuple[ProviderName, str, str]:
def _resolve_summary_model_config(
config: Dict[str, Any],
) -> tuple[ProviderName, str, str]:
llm_provider = (config.get("llm_provider") or "openai").lower().strip()
api_key = config.get("api_key") or os.getenv(f"{llm_provider.upper()}_API_KEY") or ""
api_key = (
config.get("api_key") or os.getenv(f"{llm_provider.upper()}_API_KEY") or ""
)
if llm_provider not in ("openai", "gemini") or not api_key:
raise ValueError(
"Summary analyzer requires a valid llm_provider (openai|gemini) and api_key."
Expand All @@ -385,7 +398,9 @@ def summarize_papers( # noqa: C901
provider, model_name, api_key = _resolve_summary_model_config(config)
max_input_tokens = _int_setting(config.get("max_input_tokens"), 7000, minimum=500)
max_input_chars = _int_setting(config.get("max_input_chars"), 20_000, minimum=1000)
effective_concurrency = summary_concurrency if summary_concurrency is not None else SUMMARY_CONCURRENCY
effective_concurrency = (
summary_concurrency if summary_concurrency is not None else SUMMARY_CONCURRENCY
)

pollux_config = Config(
provider=provider,
Expand All @@ -399,7 +414,9 @@ def summarize_papers( # noqa: C901
),
)

async def _run_summary_batch() -> tuple[List[str | None], List[tuple[int, BaseException]]]:
async def _run_summary_batch() -> (
tuple[List[str | None], List[tuple[int, BaseException]]]
):
semaphore = asyncio.Semaphore(effective_concurrency)
results: List[str | None] = [None] * len(papers)
failures: List[tuple[int, BaseException]] = []
Expand Down
4 changes: 1 addition & 3 deletions src/paperweight/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@ def is_db_enabled(config: Dict[str, Any]) -> bool:


@contextmanager
def connect_db(
db_config: Dict[str, Any], autocommit: bool = False
) -> Generator:
def connect_db(db_config: Dict[str, Any], autocommit: bool = False) -> Generator:
"""Create a database connection.

Args:
Expand Down
Loading