Skip to content

Commit

Permalink
Merge pull request #483 from scholarly-python-package/develop
Browse files Browse the repository at this point in the history
Release v1.7.11
  • Loading branch information
arunkannawadi committed Jan 16, 2023
2 parents 00cf1d8 + d0bf5cf commit 9269ff3
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 5 deletions.
8 changes: 8 additions & 0 deletions scholarly/_navigator.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,14 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:

if resp.status_code == 200 and not has_captcha:
return resp.text
elif resp.status_code == 404:
# If the scholar_id was approximate, it first appears as
# 404 (or 302), and then gets redirected to the correct profile.
# In such cases, we need to try again with the same session.
# See https://github.com/scholarly-python-package/scholarly/issues/469.
self.logger.debug("Got a 404 error. Attempting with same proxy")
tries += 1
continue
elif has_captcha:
self.logger.info("Got a captcha request.")
session = pm._handle_captcha2(pagerequest)
Expand Down
6 changes: 3 additions & 3 deletions scholarly/_proxy_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ def _handle_captcha2(self, url):
return self._session

def _new_session(self, **kwargs):
init_kwargs = {}
init_kwargs = {"follow_redirects": True}
init_kwargs.update(kwargs)
proxies = {}
if self._session:
Expand Down Expand Up @@ -610,7 +610,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
# https://www.scraperapi.com/documentation/
self._TIMEOUT = 60

prefix = "http://scraperapi"
prefix = "http://scraperapi.retry_404=true"
if country_code is not None:
prefix += ".country_code=" + country_code
if premium:
Expand All @@ -624,7 +624,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
for _ in range(3):
proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001')
if proxy_works:
proxies = {'http://': f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",}
proxies = {'http://': f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001",}
self.logger.info("ScraperAPI proxy setup successfully")
self._new_session(verify=False, proxies=proxies)
return proxy_works
Expand Down
8 changes: 8 additions & 0 deletions scholarly/author_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,14 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit:
url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
soup = self.nav._get_soup(url)

# Update scholar_id
scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0]
if scholar_id != author['scholar_id']:
self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. "
"To avoid this warning, use %s to look up this scholar.",
author['scholar_id'], scholar_id, scholar_id)
author["scholar_id"] = scholar_id

if sections == []:
for i in self._sections:
if i not in author['filled']:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name='scholarly',
version='1.7.10',
version='1.7.11',
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
description='Simple access to Google Scholar authors and citations',
Expand Down
15 changes: 14 additions & 1 deletion test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,16 @@ def test_search_author_id_filled(self):
self.assertEqual(author['interests'], [])
self.assertEqual(author['public_access']['available'], 0)
self.assertEqual(author['public_access']['not_available'], 0)
self.assertGreaterEqual(author['citedby'], 2067) # TODO: maybe change
self.assertGreaterEqual(author['citedby'], 2090)
self.assertGreaterEqual(len(author['publications']), 218)
cpy = {1986:4, 2011: 137, 2018: 100}
for year, count in cpy.items():
self.assertEqual(author["cites_per_year"][year], count)
pub = author['publications'][1]
self.assertEqual(pub["citedby_url"],
"https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702")


def test_extract_author_id_list(self):
'''
This unit test tests the extraction of the author id field from the html to populate the `author_id` field
Expand Down Expand Up @@ -570,6 +574,15 @@ def test_cites_per_year(self):
for year, count in cpy.items():
self.assertEqual(author['cites_per_year'][year], count)

def test_redirect(self):
    """Check that an approximate scholar_id is resolved via redirect.

    Looks up an author by an approximate ID, verifies the returned record
    carries the expected ID, then fills the "basics" section and checks the
    author's name and a lower bound on the citation count.
    """
    approximate_id = "oMaIg8sAAAAJ"
    expected_id = "PEJ42J0AAAAJ"

    author = scholarly.search_author_id(approximate_id)
    self.assertEqual(author["scholar_id"], expected_id)

    # Filling must keep working on the record obtained through the redirect.
    scholarly.fill(author, sections=["basics"])
    self.assertEqual(author["name"], "Kiran Bhatia")
    self.assertGreaterEqual(author["citedby"], 135)

class TestScholarlyWithProxy(unittest.TestCase):
@classmethod
def setUpClass(cls):
Expand Down

0 comments on commit 9269ff3

Please sign in to comment.