Skip to content

Commit

Permalink
add paged_articles to support paging through results #12
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbot committed Dec 6, 2023
1 parent 307fa01 commit 1e213c5
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
19 changes: 17 additions & 2 deletions waybacknews/searchapi.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import datetime as dt
from typing import List, Dict
from typing import List, Dict, Optional
import requests
import logging
import ciso8601
Expand Down Expand Up @@ -100,7 +100,6 @@ def article(self, article_id: str) -> Dict:
def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime, page_size: int = 1000, **kwargs):
"""
@return: a generator that yields lists of articles, grouped by page.
@Question: Should it return articles one by one, not by page?
"""
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
params.update(kwargs)
Expand All @@ -118,6 +117,22 @@ def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetim
params['resume'] = next_link_token
more_pages = True

def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
                   page_size: Optional[int] = 1000, pagination_token: Optional[str] = None, **kwargs):
    """
    Fetch a single page of stories matching the query within the date range.

    @param query: search query; it is AND-ed with the clause built by _date_query_clause
    @param start_date: start of the date range, handed to _date_query_clause
    @param end_date: end of the date range, handed to _date_query_clause
    @param page_size: accepted to mirror the all_articles signature; this method does not
                      add it to the request parameters
    @param pagination_token: token returned with a previous page; when given it is sent as
                             the 'resume' parameter so the search continues from there
    @param kwargs: extra request parameters; applied last, so they override the ones above
    @return: a (page, next_token) tuple. page is the list of stories for this page and
             next_token is the value of the 'x-resume-token' response header (None when the
             header is absent). When there are no results this is ([], None), so callers
             can always unpack two values.
    """
    params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
    if pagination_token:
        params['resume'] = pagination_token
    params.update(kwargs)
    page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
    if self._is_no_results(page):
        # keep the same shape as the success case; a bare [] breaks tuple unpacking in callers
        return [], None
    return page, response.headers.get('x-resume-token')

def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
params.update(kwargs)
Expand Down
23 changes: 22 additions & 1 deletion waybacknews/tests/test_waybacknews.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,34 @@ def test_all_articles(self):
# make sure the test case is a reasonable size (i.e. more than one page, but not too many pages)
assert story_count > 0
assert story_count < 5000
# now text it
# now test it
found_story_count = 0
for page in self._api.all_articles(query, start_date, end_date):
assert len(page) > 0
found_story_count += len(page)
assert found_story_count == story_count

def test_paged_articles(self):
    """Fetch two consecutive pages through paged_articles and check that they differ."""
    query = "biden"
    window = (dt.datetime(2023, 11, 25), dt.datetime(2023, 11, 26))
    total = self._api.count(query, *window)
    # sanity-check the size of the test case (some stories, but not too many pages)
    assert total > 0
    assert total < 10000
    # first page: no pagination token supplied
    first_page, first_token = self._api.paged_articles(query, *window)
    assert len(first_page) > 0
    assert first_token is not None
    first_url = first_page[0]['url']
    # second page: resume from the token that came back with the first page
    second_page, second_token = self._api.paged_articles(query, *window, pagination_token=first_token)
    assert len(second_page) > 0
    assert second_token is not None
    assert first_token != second_token  # the paging token must change between pages
    second_urls = [story['url'] for story in second_page]
    assert first_url not in second_urls  # the two pages must not overlap

def test_top_sources(self):
results = self._api.top_sources("coronavirus", dt.datetime(2022, 3, 1), dt.datetime(2022, 4, 1))
assert len(results) > 0
Expand Down

0 comments on commit 1e213c5

Please sign in to comment.