Skip to content

Commit 04275ba

Browse files
Merge pull request #449 from scholarly-python-package/citedby1k
Fetch more than 1000 citations
2 parents 04cd545 + 3ae2333 commit 04275ba

File tree

2 files changed

+107
-8
lines changed

2 files changed

+107
-8
lines changed

scholarly/_scholarly.py

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
import copy
66
import csv
77
import pprint
8-
from typing import Dict, List
8+
import datetime
9+
import re
10+
from typing import Dict, List, Union
911
from ._navigator import Navigator
1012
from ._proxy_generator import ProxyGenerator
1113
from dotenv import find_dotenv, load_dotenv
1214
from .author_parser import AuthorParser
1315
from .publication_parser import PublicationParser, _SearchScholarIterator
14-
from .data_types import Author, AuthorSource, Journal, Publication, PublicationSource
16+
from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource
1517

1618
_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
1719
_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
@@ -157,11 +159,11 @@ def search_pubs(self,
157159
sort_by=sort_by, include_last_year=include_last_year, start_index=start_index)
158160
return self.__nav.search_publications(url)
159161

160-
def search_citedby(self, publication_id: int, **kwargs):
162+
def search_citedby(self, publication_id: Union[int, str], **kwargs):
161163
"""Searches by Google Scholar publication id and returns a generator of Publication objects.
162164
163165
:param publication_id: Google Scholar publication id
164-
:type publication_id: int
166+
:type publication_id: int or str
165167
166168
For the remaining parameters, see documentation of `search_pubs`.
167169
"""
@@ -250,20 +252,70 @@ def bibtex(self, object: Publication)->str:
250252
self.logger.warning("Object not supported for bibtex exportation")
251253
return
252254

255+
@staticmethod
def _bin_citations_by_year(cites_per_year: CitesPerYear, year_end, max_citations: int = 1000):
    """Group years into contiguous ranges of at most `max_citations` citations.

    Years are walked from the most recent to the oldest and greedily merged
    into the current range for as long as the accumulated citation count
    stays within `max_citations`. A year that exceeds the limit on its own
    still ends up in a range, because a range cannot be narrower than a year.

    :param cites_per_year: mapping of a year to the number of citations in that year
    :type cites_per_year: CitesPerYear
    :param year_end: most recent year that the first range must reach
    :type year_end: int
    :param max_citations: largest citation count allowed in a multi-year range, defaults to 1000
    :type max_citations: int
    :return: `(year_high, year_low)` tuples with `year_low <= year_high`, ordered from
             the most recent range to the oldest; empty if there are no citations
    :rtype: list
    """
    years = []
    # The first range opens at `year_end`, or later if the data reaches beyond it,
    # so that `y_lo <= y_hi` always holds.
    y_hi = y_lo = max([year_end, *cites_per_year])
    running_count = 0
    for y in sorted(cites_per_year, reverse=True):
        count = cites_per_year[y]
        # Close the current range only if it already holds citations; this avoids
        # emitting an empty range when the very first year is over the limit.
        if running_count and running_count + count > max_citations:
            years.append((y_hi, y_lo))
            y_hi = y
            running_count = 0
        running_count += count
        # Both bounds follow the year just added, so a range that ends up holding
        # a single year is reported as (y, y) rather than with a stale lower bound.
        y_lo = y

    if running_count > 0:
        years.append((y_hi, y_lo))

    return years
273+
253274
def citedby(self, object: Publication)->_SearchScholarIterator:
    """Searches Google Scholar for other articles that cite this Publication
    and returns a Publication generator.

    A publication with more than 1000 citations is queried one range of
    years at a time, and the results of those queries are chained together.

    :param object: The Publication object whose citing articles are searched
    :type object: Publication
    :return: an iterator over the citing publications, or None (after
             logging a warning) if `object` is not a Publication
    """
    # NOTE: this method must not contain `yield` itself. It would then become
    # a generator function, and the plain `return <iterator>` paths below
    # would produce a generator that yields nothing.
    if object['container_type'] != "Publication":
        self.logger.warning("Object not supported for citedby search")
        return None

    if object["num_citations"] <= 1000:
        return PublicationParser(self.__nav).citedby(object)

    self.logger.debug("Since the paper titled %s has %d citations (>1000), "
                      "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"])

    year_end = datetime.date.today().year

    if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        # Filling the publication is what provides `cites_per_year`.
        self.fill(object)
        years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end)
    else:
        try:
            year_low = int(object["bib"]["pub_year"])
        except (KeyError, ValueError):
            self.logger.warning("Unknown publication year for paper %s, may result in incorrect number "
                                "of citedby papers.", object["bib"]["title"])
            return PublicationParser(self.__nav).citedby(object)

        # Go one year at a time in decreasing order
        years = [(y, y) for y in range(year_end, year_low - 1, -1)]

    # Extract cites_id. Note: There could be multiple ones, separated by commas.
    m = re.search(r"cites=[\d+,]*", object["citedby_url"])
    if m is None:
        self.logger.warning("Could not find the cites id in %s, may result in incorrect number "
                            "of citedby papers.", object["citedby_url"])
        return PublicationParser(self.__nav).citedby(object)

    pub_id = m.group()[6:]  # drop the leading "cites="
    return self._citedby_by_year_range(object, pub_id, years)

def _citedby_by_year_range(self, object: Publication, pub_id: str, years):
    """Yield the publications citing `object`, one `(year_high, year_low)` range at a time.

    :param object: The Publication being cited; only used for log messages
    :type object: Publication
    :param pub_id: the cites id(s) passed to `search_citedby`
    :type pub_id: str
    :param years: iterable of `(year_high, year_low)` tuples to query
    """
    for y_hi, y_lo in years:
        sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi)
        if sub_citations.total_results and (sub_citations.total_results > 1000):
            self.logger.warning("The paper titled %s has %d citations in the year %d. "
                                "Due to the limitation in Google Scholar, fetching only 1000 results "
                                "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo)
        yield from sub_citations
318+
267319
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
268320
"""Search by author id and return a single Author object
269321
:param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.

test_module.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
import os
33
import sys
4+
from collections import Counter
45
from scholarly import scholarly, ProxyGenerator
56
from scholarly.data_types import Mandate
67
from scholarly.publication_parser import PublicationParser
@@ -549,6 +550,16 @@ def test_save_journal_leaderboard(self):
549550
if os.path.exists(filename):
550551
os.remove(filename)
551552

553+
def test_bin_citations_by_year(self):
    """Check the year ranges produced by the internal binning helper.

    Each range must have its lower year no later than its upper year, and
    the citations falling inside it must add up to no more than 1000.
    """
    cites_by_year = {
        2022: 490, 2021: 340, 2020: 327, 2019: 298, 2018: 115, 2017: 49,
        2016: 20, 2015: 8, 2014: 3, 2013: 1, 2012: 1,
    }
    bins = scholarly._bin_citations_by_year(cites_by_year, 2022)
    for upper, lower in bins:
        self.assertLessEqual(lower, upper)
        citations_in_bin = sum(cites_by_year[year] for year in range(lower, upper + 1))
        self.assertLessEqual(citations_in_bin, 1000)
562+
552563

553564
class TestScholarlyWithProxy(unittest.TestCase):
554565
@classmethod
@@ -795,5 +806,41 @@ def test_pubs_custom_url(self):
795806
self.assertEqual(pub['bib']['pub_year'], '2009')
796807
self.assertGreaterEqual(pub['num_citations'], 581)
797808

809+
def check_citedby_1k(self, pub):
    """Shared check that `scholarly.citedby` returns every citation of `pub`.

    :param pub: the publication to look up; its `num_citations` entry is
                overwritten with 1001 when it is 1000 or lower
    :return: the list of publications obtained from `scholarly.citedby`
    """
    expected_count = pub["num_citations"]
    # Push the reported count past 1000 so that a different code path is triggered.
    if expected_count <= 1000:
        pub["num_citations"] = 1001
    fetched = list(scholarly.citedby(pub))
    self.assertEqual(len(fetched), expected_count)
    return fetched
820+
821+
@unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup")
def test_citedby_1k_citations(self):
    """Test that scholarly can fetch 1000+ citations from an author

    The citing publications are also tallied by publication year and
    compared with the publication's `cites_per_year` entries.
    """
    author = scholarly.search_author_id('QoX9bu8AAAAJ')
    scholarly.fill(author, sections=['publications'])
    wanted_id = "QoX9bu8AAAAJ:L8Ckcad2t8MC"
    matching = [entry for entry in author['publications'] if entry["author_pub_id"] == wanted_id]
    pub = matching[0]
    scholarly.fill(pub)
    citing_pubs = self.check_citedby_1k(pub)

    per_year = Counter(citing["bib"]["pub_year"] for citing in citing_pubs)
    for year, expected in pub["cites_per_year"].items():
        # The fetched years are counted under string keys, hence str(year).
        found = per_year.get(str(year), 0)
        self.assertEqual(found, expected)
834+
835+
@unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup")
def test_citedby_1k_scholar(self):
    """Test that scholarly can fetch 1000+ citations from a pub search.

    The first result of a title search is handed to `check_citedby_1k`.
    """
    title = "Persistent entanglement in a class of eigenstates of quantum Heisenberg spin glasses"
    first_hit = next(scholarly.search_pubs(title))
    self.check_citedby_1k(first_hit)
843+
844+
798845
# Run the test suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)