Skip to content

Commit 9269ff3

Browse files
Merge pull request #483 from scholarly-python-package/develop
Release v1.7.11
2 parents 00cf1d8 + d0bf5cf commit 9269ff3

File tree

5 files changed

+34
-5
lines changed

5 files changed

+34
-5
lines changed

scholarly/_navigator.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
119119

120120
if resp.status_code == 200 and not has_captcha:
121121
return resp.text
122+
elif resp.status_code == 404:
123+
# If the scholar_id was approximate, it first appears as
124+
# 404 (or 302), and then gets redirected to the correct profile.
125+
# In such cases, we need to try again with the same session.
126+
# See https://github.com/scholarly-python-package/scholarly/issues/469.
127+
self.logger.debug("Got a 404 error. Attempting with same proxy")
128+
tries += 1
129+
continue
122130
elif has_captcha:
123131
self.logger.info("Got a captcha request.")
124132
session = pm._handle_captcha2(pagerequest)

scholarly/_proxy_generator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ def _handle_captcha2(self, url):
451451
return self._session
452452

453453
def _new_session(self, **kwargs):
454-
init_kwargs = {}
454+
init_kwargs = {"follow_redirects": True}
455455
init_kwargs.update(kwargs)
456456
proxies = {}
457457
if self._session:
@@ -610,7 +610,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
610610
# https://www.scraperapi.com/documentation/
611611
self._TIMEOUT = 60
612612

613-
prefix = "http://scraperapi"
613+
prefix = "http://scraperapi.retry_404=true"
614614
if country_code is not None:
615615
prefix += ".country_code=" + country_code
616616
if premium:
@@ -624,7 +624,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
624624
for _ in range(3):
625625
proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001')
626626
if proxy_works:
627-
proxies = {'http://': f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",}
627+
proxies = {'http://': f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001",}
628628
self.logger.info("ScraperAPI proxy setup successfully")
629629
self._new_session(verify=False, proxies=proxies)
630630
return proxy_works

scholarly/author_parser.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,14 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit:
440440
url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
441441
soup = self.nav._get_soup(url)
442442

443+
# Update scholar_id
444+
scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0]
445+
if scholar_id != author['scholar_id']:
446+
self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. "
447+
"To avoid this warning, use %s to look up this scholar.",
448+
author['scholar_id'], scholar_id, scholar_id)
449+
author["scholar_id"] = scholar_id
450+
443451
if sections == []:
444452
for i in self._sections:
445453
if i not in author['filled']:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.7.10',
8+
version='1.7.11',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
1111
description='Simple access to Google Scholar authors and citations',

test_module.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,12 +264,16 @@ def test_search_author_id_filled(self):
264264
self.assertEqual(author['interests'], [])
265265
self.assertEqual(author['public_access']['available'], 0)
266266
self.assertEqual(author['public_access']['not_available'], 0)
267-
self.assertGreaterEqual(author['citedby'], 2067) # TODO: maybe change
267+
self.assertGreaterEqual(author['citedby'], 2090)
268268
self.assertGreaterEqual(len(author['publications']), 218)
269+
cpy = {1986:4, 2011: 137, 2018: 100}
270+
for year, count in cpy.items():
271+
self.assertEqual(author["cites_per_year"][year], count)
269272
pub = author['publications'][1]
270273
self.assertEqual(pub["citedby_url"],
271274
"https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702")
272275

276+
273277
def test_extract_author_id_list(self):
274278
'''
275279
This unit test tests the extraction of the author id field from the html to populate the `author_id` field
@@ -570,6 +574,15 @@ def test_cites_per_year(self):
570574
for year, count in cpy.items():
571575
self.assertEqual(author['cites_per_year'][year], count)
572576

577+
def test_redirect(self):
578+
"""Test that we can handle redirects when the scholar_id is approximate.
579+
"""
580+
author = scholarly.search_author_id("oMaIg8sAAAAJ")
581+
self.assertEqual(author["scholar_id"], "PEJ42J0AAAAJ")
582+
scholarly.fill(author, sections=["basics"])
583+
self.assertEqual(author["name"], "Kiran Bhatia")
584+
self.assertGreaterEqual(author["citedby"], 135)
585+
573586
class TestScholarlyWithProxy(unittest.TestCase):
574587
@classmethod
575588
def setUpClass(cls):

0 commit comments

Comments (0)