add paged_articles to support paging through results #12

mediacloud · Dec 6, 2023 · 1e213c5 · 1e213c5
1 parent 307fa01
commit 1e213c5
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 3 deletions.
diff --git a/waybacknews/searchapi.py b/waybacknews/searchapi.py
@@ -1,5 +1,5 @@
 import datetime as dt
-from typing import List, Dict
+from typing import List, Dict, Optional
 import requests
 import logging
 import ciso8601
@@ -100,7 +100,6 @@ def article(self, article_id: str) -> Dict:
     def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime, page_size: int = 1000, **kwargs):
         """
         @return: a generator that yeilds lists of articles, grouped by page.
-        @Question: Should it return articles one by one, not by page? 
         """
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
@@ -118,6 +117,22 @@ def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetim
                 params['resume'] = next_link_token
                 more_pages = True
 
+    def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
+                       page_size: Optional[int] = 1000, pagination_token: Optional[str] = None, **kwargs):
+        """
+        Fetch a single page of stories; pass the returned token back in as `pagination_token` for the next one.
+        `page_size` is accepted but not added to the request params here; `kwargs` are merged into them last.
+        @return: a `(page, next_token)` tuple - `([], None)` when nothing matched, so callers can always unpack it
+        """
+        params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
+        if pagination_token:
+            params['resume'] = pagination_token
+        params.update(kwargs)
+        page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
+        if self._is_no_results(page):
+            return [], None
+        return page, response.headers.get('x-resume-token')
+
     def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)

diff --git a/waybacknews/tests/test_waybacknews.py b/waybacknews/tests/test_waybacknews.py
@@ -94,13 +94,34 @@ def test_all_articles(self):
         # make sure test case is reasonable size (ie. more than one page, but not too many pages
         assert story_count > 0
         assert story_count < 5000
-        # now text it
+        # now test it
         found_story_count = 0
         for page in self._api.all_articles(query, start_date, end_date):
             assert len(page) > 0
             found_story_count += len(page)
         assert found_story_count == story_count
 
+    def test_paged_articles(self):
+        """Fetch two consecutive pages by hand and check the resume token advances without repeating a story."""
+        search_term = "biden"
+        window = (dt.datetime(2023, 11, 25), dt.datetime(2023, 11, 26))
+        total = self._api.count(search_term, *window)
+        # sanity-check the fixture size: there must be matches, but not an unreasonable number of them
+        assert 0 < total < 10000
+        # first request: no pagination token supplied
+        first_page, first_token = self._api.paged_articles(search_term, *window)
+        assert len(first_page) > 0
+        assert first_token is not None
+        first_url = first_page[0]['url']
+        # second request: resume from the token the first request handed back
+        second_page, second_token = self._api.paged_articles(search_term, *window, pagination_token=first_token)
+        assert len(second_page) > 0
+        assert second_token is not None
+        assert first_token != second_token  # the token must advance between pages
+        # the second page must not repeat the first page's leading story
+        second_urls = [story['url'] for story in second_page]
+        assert first_url not in second_urls
+
     def test_top_sources(self):
         results = self._api.top_sources("coronavirus", dt.datetime(2022, 3, 1), dt.datetime(2022, 4, 1))
         assert len(results) > 0