|
6 | 6 | import csv |
7 | 7 | import pprint |
8 | 8 | import datetime |
9 | | -import itertools |
10 | | -import warnings |
11 | 9 | from typing import Dict, List |
| 10 | +import re |
12 | 11 | from ._navigator import Navigator |
13 | 12 | from ._proxy_generator import ProxyGenerator |
14 | 13 | from dotenv import find_dotenv, load_dotenv |
@@ -284,23 +283,38 @@ def citedby(self, object: Publication)->_SearchScholarIterator: |
284 | 283 | self.logger.warning("Object not supported for bibtex exportation") |
285 | 284 | return |
286 | 285 |
|
287 | | - if object["bib"]["citedby"] < 999: |
| 286 | + if object["num_citations"] <= 1000: |
288 | 287 | return PublicationParser(self.__nav).citedby(object) |
| 288 | + |
| 289 | + self.logger.debug("Since the paper titled %s has %d citations (>1000), " |
| 290 | + "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"]) |
| 291 | + |
| 292 | + year_end = int(datetime.date.today().year) |
| 293 | + |
| 294 | + if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: |
| 295 | + self.fill(object) |
| 296 | + years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end) |
289 | 297 | else: |
290 | 298 | try: |
291 | 299 | year_low = int(object["bib"]["pub_year"]) |
292 | | - year_end = int(datetime.date.today().year) |
293 | 300 | except KeyError: |
294 | | - self.logger.warning("Unknown publication year for paper %s, may result in incorrect number of citedby papers.", object["bib"]["title"]) |
| 301 | + self.logger.warning("Unknown publication year for paper %s, may result in incorrect number " |
| 302 | + "of citedby papers.", object["bib"]["title"]) |
295 | 303 | return PublicationParser(self.__nav).citedby(object) |
296 | 304 |
|
297 | | - pub_id = int(object["citedby_url"].split("=")[1].split("&")[0]) |
298 | | - iter_list = [] |
299 | | - while year_low < year_end: |
300 | | - iter_list.append(self.search_citedby(publication_id=pub_id, year_low=year_low, year_high=year_low+1)) |
301 | | - year_low += 1 |
302 | | - |
303 | | - return itertools.chain(*iter_list) |
| 305 | + # Go one year at a time in decreasing order |
| 306 | + years = ((year, year) for year in range(year_end, year_low-1, -1)) |
| 307 | + |
| 308 | + # Extract cites_id. Note: There could be multiple ones, separated by commas. |
| 309 | + m = re.search(r"cites=[\d,]+", object["citedby_url"]) |
| 310 | + pub_id = m.group()[6:] |
| 311 | + for y_hi, y_lo in years: |
| 312 | + sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi) |
| 313 | + if sub_citations.total_results and (sub_citations.total_results > 1000): |
| 314 | + self.logger.warning("The paper titled %s has %d citations in the year %d. " |
| 315 | + "Due to a limitation in Google Scholar, fetching only 1000 results " |
| 316 | + "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo) |
| 317 | + yield from sub_citations |
304 | 318 |
|
305 | 319 | def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author: |
306 | 320 | """Search by author id and return a single Author object |
|
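For context, the per-year strategy in this diff boils down to one pattern: split the cited-by query into one-year windows (newest year first) and chain the results, because Google Scholar only exposes roughly the first 1000 results of any single query. The sketch below is illustrative only; `fetch_for_year` is a hypothetical stand-in for `search_citedby` and is not part of the scholarly API.

```python
# Illustrative sketch of the per-year fetching pattern used in the diff.
# `fetch_for_year` is a hypothetical stand-in for Scholarly.search_citedby and
# is not part of the scholarly API.
import datetime
from typing import Callable, Iterable, Iterator, Optional


def iterate_citations_by_year(fetch_for_year: Callable[[int, int], Iterable],
                              year_low: int,
                              year_end: Optional[int] = None) -> Iterator:
    """Yield citing papers one calendar year at a time, newest year first."""
    if year_end is None:
        year_end = datetime.date.today().year
    for year in range(year_end, year_low - 1, -1):
        # Each sub-query covers a single year (year_low == year_high), so it only
        # runs into the ~1000-result cap if that single year alone exceeds it.
        yield from fetch_for_year(year, year)
```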