From 11042104071028574508357d73267c39497604ab Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Sat, 14 Jan 2023 18:44:44 -0500 Subject: [PATCH 1/6] Add a cpy check in search_author_id_filled --- test_module.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_module.py b/test_module.py index 7ce4729..75198f7 100644 --- a/test_module.py +++ b/test_module.py @@ -264,12 +264,16 @@ def test_search_author_id_filled(self): self.assertEqual(author['interests'], []) self.assertEqual(author['public_access']['available'], 0) self.assertEqual(author['public_access']['not_available'], 0) - self.assertGreaterEqual(author['citedby'], 2067) # TODO: maybe change + self.assertGreaterEqual(author['citedby'], 2090) self.assertGreaterEqual(len(author['publications']), 218) + cpy = {1986:4, 2011: 137, 2018: 100} + for year, count in cpy.items(): + self.assertEqual(author["cites_per_year"][year], count) pub = author['publications'][1] self.assertEqual(pub["citedby_url"], "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702") + def test_extract_author_id_list(self): ''' This unit test tests the extraction of the author id field from the html to populate the `author_id` field From 630d12ce61c14fe272954d172ca9f6e6ce5d07b5 Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Mon, 16 Jan 2023 15:49:51 -0500 Subject: [PATCH 2/6] Retry with same proxy after 404 --- scholarly/_navigator.py | 8 ++++++++ scholarly/_proxy_generator.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/scholarly/_navigator.py b/scholarly/_navigator.py index 0fbda9d..8bea767 100644 --- a/scholarly/_navigator.py +++ b/scholarly/_navigator.py @@ -119,6 +119,14 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: if resp.status_code == 200 and not has_captcha: return resp.text + elif resp.status_code == 404: + # If the scholar_id was approximate, it first appears as + # 404 (or 302), and then gets redirected to the correct profile. + # In such cases, we need to try again with the same session. + # See https://github.com/scholarly-python-package/scholarly/issues/469. + self.logger.debug("Got a 404 error. Attempting with same proxy") + tries += 1 + continue elif has_captcha: self.logger.info("Got a captcha request.") session = pm._handle_captcha2(pagerequest) diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py index f451ec5..bea892c 100644 --- a/scholarly/_proxy_generator.py +++ b/scholarly/_proxy_generator.py @@ -451,7 +451,7 @@ def _handle_captcha2(self, url): return self._session def _new_session(self, **kwargs): - init_kwargs = {} + init_kwargs = {"follow_redirects": True} init_kwargs.update(kwargs) proxies = {} if self._session: @@ -610,7 +610,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): # https://www.scraperapi.com/documentation/ self._TIMEOUT = 60 - prefix = "http://scraperapi" + prefix = "http://scraperapi.retry_404=true" if country_code is not None: prefix += ".country_code=" + country_code if premium: From 389082bb91d01d9f5a61047d7228839339c7b092 Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Mon, 16 Jan 2023 15:50:24 -0500 Subject: [PATCH 3/6] Update scholar_id upon fetching the page --- scholarly/author_parser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py index 53dad8e..d46038e 100644 --- a/scholarly/author_parser.py +++ b/scholarly/author_parser.py @@ -440,6 +440,14 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit: url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE) soup = self.nav._get_soup(url) + # Update scholar_id + scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0] + if scholar_id != author['scholar_id']: + self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. " + "To avoid this warning, use %s to look up this scholar.", + author['scholar_id'], scholar_id, scholar_id) + author["scholar_id"] = scholar_id + if sections == []: for i in self._sections: if i not in author['filled']: From fb47eaddf3989d3c53f130c9cf7a07f431f7f556 Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Mon, 16 Jan 2023 16:41:49 -0500 Subject: [PATCH 4/6] Add a unit test to handle 302/404 --- test_module.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test_module.py b/test_module.py index 75198f7..0effc0f 100644 --- a/test_module.py +++ b/test_module.py @@ -574,6 +574,15 @@ def test_cites_per_year(self): for year, count in cpy.items(): self.assertEqual(author['cites_per_year'][year], count) + def test_redirect(self): + """Test that we can handle redirects when the scholar_id is approximate. + """ + author = scholarly.search_author_id("oMaIg8sAAAAJ") + self.assertEqual(author["scholar_id"], "PEJ42J0AAAAJ") + scholarly.fill(author, sections=["basics"]) + self.assertEqual(author["name"], "Kiran Bhatia") + self.assertGreaterEqual(author["citedby"], 135) + class TestScholarlyWithProxy(unittest.TestCase): @classmethod def setUpClass(cls): From a0e14136f2f00e5a3770cb24c56ff14ce874a3a8 Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Mon, 16 Jan 2023 15:51:19 -0500 Subject: [PATCH 5/6] Fix a bug in using premium options for ScraperAPI --- scholarly/_proxy_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py index bea892c..49d5bd5 100644 --- a/scholarly/_proxy_generator.py +++ b/scholarly/_proxy_generator.py @@ -624,7 +624,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): for _ in range(3): proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001') if proxy_works: - proxies = {'http://': f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",} + proxies = {'http://': f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001",} self.logger.info("ScraperAPI proxy setup successfully") self._new_session(verify=False, proxies=proxies) return proxy_works From d0bf5cfd740e4aba843ac97f4895a76272e94d4f Mon Sep 17 00:00:00 2001 From: arunkannawadi Date: Mon, 16 Jan 2023 16:48:02 -0500 Subject: [PATCH 6/6] Bump version to 1.7.11 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cadec5a..b4ea378 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name='scholarly', - version='1.7.10', + version='1.7.11', author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi', author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu', description='Simple access to Google Scholar authors and citations',