Skip to content

Commit 9269ff3

Browse files
Merge pull request #483 from scholarly-python-package/develop
Release v1.7.11
2 parents 00cf1d8 + d0bf5cf commit 9269ff3

File tree

5 files changed

+34
-5
lines changed

5 files changed

+34
-5
lines changed

scholarly/_navigator.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
119119

120120
if resp.status_code == 200 and not has_captcha:
121121
return resp.text
122+
elif resp.status_code == 404:
123+
# If the scholar_id was approximate, it first appears as
124+
# 404 (or 302), and then gets redirected to the correct profile.
125+
# In such cases, we need to try again with the same session.
126+
# See https://github.com/scholarly-python-package/scholarly/issues/469.
127+
self.logger.debug("Got a 404 error. Attempting with same proxy")
128+
tries += 1
129+
continue
122130
elif has_captcha:
123131
self.logger.info("Got a captcha request.")
124132
session = pm._handle_captcha2(pagerequest)

scholarly/_proxy_generator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ def _handle_captcha2(self, url):
451451
return self._session
452452

453453
def _new_session(self, **kwargs):
454-
init_kwargs = {}
454+
init_kwargs = {"follow_redirects": True}
455455
init_kwargs.update(kwargs)
456456
proxies = {}
457457
if self._session:
@@ -610,7 +610,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
610610
# https://www.scraperapi.com/documentation/
611611
self._TIMEOUT = 60
612612

613-
prefix = "http://scraperapi"
613+
prefix = "http://scraperapi.retry_404=true"
614614
if country_code is not None:
615615
prefix += ".country_code=" + country_code
616616
if premium:
@@ -624,7 +624,7 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
624624
for _ in range(3):
625625
proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001')
626626
if proxy_works:
627-
proxies = {'http://': f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",}
627+
proxies = {'http://': f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001",}
628628
self.logger.info("ScraperAPI proxy setup successfully")
629629
self._new_session(verify=False, proxies=proxies)
630630
return proxy_works

scholarly/author_parser.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,14 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit:
440440
url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
441441
soup = self.nav._get_soup(url)
442442

443+
# Update scholar_id
444+
scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0]
445+
if scholar_id != author['scholar_id']:
446+
self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. "
447+
"To avoid this warning, use %s to look up this scholar.",
448+
author['scholar_id'], scholar_id, scholar_id)
449+
author["scholar_id"] = scholar_id
450+
443451
if sections == []:
444452
for i in self._sections:
445453
if i not in author['filled']:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.7.10',
8+
version='1.7.11',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
1111
description='Simple access to Google Scholar authors and citations',

test_module.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,12 +264,16 @@ def test_search_author_id_filled(self):
264264
self.assertEqual(author['interests'], [])
265265
self.assertEqual(author['public_access']['available'], 0)
266266
self.assertEqual(author['public_access']['not_available'], 0)
267-
self.assertGreaterEqual(author['citedby'], 2067) # TODO: maybe change
267+
self.assertGreaterEqual(author['citedby'], 2090)
268268
self.assertGreaterEqual(len(author['publications']), 218)
269+
cpy = {1986:4, 2011: 137, 2018: 100}
270+
for year, count in cpy.items():
271+
self.assertEqual(author["cites_per_year"][year], count)
269272
pub = author['publications'][1]
270273
self.assertEqual(pub["citedby_url"],
271274
"https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702")
272275

276+
273277
def test_extract_author_id_list(self):
274278
'''
275279
This unit test tests the extraction of the author id field from the html to populate the `author_id` field
@@ -570,6 +574,15 @@ def test_cites_per_year(self):
570574
for year, count in cpy.items():
571575
self.assertEqual(author['cites_per_year'][year], count)
572576

577+
def test_redirect(self):
578+
"""Test that we can handle redirects when the scholar_id is approximate.
579+
"""
580+
author = scholarly.search_author_id("oMaIg8sAAAAJ")
581+
self.assertEqual(author["scholar_id"], "PEJ42J0AAAAJ")
582+
scholarly.fill(author, sections=["basics"])
583+
self.assertEqual(author["name"], "Kiran Bhatia")
584+
self.assertGreaterEqual(author["citedby"], 135)
585+
573586
class TestScholarlyWithProxy(unittest.TestCase):
574587
@classmethod
575588
def setUpClass(cls):

0 commit comments

Comments (0)