|
5 | 5 | import copy |
6 | 6 | import csv |
7 | 7 | import pprint |
8 | | -from typing import Dict, List |
| 8 | +import datetime |
| 9 | +import re |
| 10 | +from typing import Dict, List, Union |
9 | 11 | from ._navigator import Navigator |
10 | 12 | from ._proxy_generator import ProxyGenerator |
11 | 13 | from dotenv import find_dotenv, load_dotenv |
12 | 14 | from .author_parser import AuthorParser |
13 | 15 | from .publication_parser import PublicationParser, _SearchScholarIterator |
14 | | -from .data_types import Author, AuthorSource, Journal, Publication, PublicationSource |
| 16 | +from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource |
15 | 17 |
|
16 | 18 | _AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}' |
17 | 19 | _KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}' |
@@ -157,11 +159,11 @@ def search_pubs(self, |
157 | 159 | sort_by=sort_by, include_last_year=include_last_year, start_index=start_index) |
158 | 160 | return self.__nav.search_publications(url) |
159 | 161 |
|
160 | | - def search_citedby(self, publication_id: int, **kwargs): |
| 162 | + def search_citedby(self, publication_id: Union[int, str], **kwargs): |
161 | 163 | """Searches by Google Scholar publication id and returns a generator of Publication objects. |
162 | 164 |
|
163 | 165 | :param publication_id: Google Scholar publication id |
164 | | - :type publication_id: int |
| 166 | + :type publication_id: int or str |
165 | 167 |
|
166 | 168 | For the remaining parameters, see documentation of `search_pubs`. |
167 | 169 | """ |
@@ -250,20 +252,70 @@ def bibtex(self, object: Publication)->str: |
250 | 252 | self.logger.warning("Object not supported for bibtex exportation") |
251 | 253 | return |
252 | 254 |
|
@staticmethod
def _bin_citations_by_year(cites_per_year: "CitesPerYear", year_end):
    """Group consecutive years into ranges of at most 1000 citations each.

    Years are walked from the most recent to the oldest and accumulated
    into a range until adding the next year would push the running total
    above 1000; the range is then closed and a new one is started at that
    year. A single year that alone exceeds 1000 citations gets a range of
    its own (it cannot be split any further here).

    :param cites_per_year: mapping of year -> number of citations in that year
    :type cites_per_year: CitesPerYear
    :param year_end: most recent year to cover; upper bound of the first range
    :type year_end: int
    :returns: list of ``(year_high, year_low)`` tuples, newest range first,
              always with ``year_high >= year_low``. Empty when there are
              no citations to bin.
    :rtype: list
    """
    max_per_range = 1000

    ranges = []
    y_hi = y_lo = year_end
    running_count = 0
    for year in sorted(cites_per_year, reverse=True):
        count = cites_per_year[year]
        # Close the current range only if it already holds citations.
        # Closing an empty range would emit a useless (and, when the year
        # equals year_end, duplicated) range.
        if running_count > 0 and running_count + count > max_per_range:
            ranges.append((y_hi, y_lo))
            y_hi = year
            running_count = 0
        running_count += count
        # Always pull the lower bound down to the year just absorbed, so a
        # freshly started range never keeps the previous range's lower bound.
        y_lo = year

    if running_count > 0:
        ranges.append((y_hi, y_lo))

    return ranges
| 273 | + |
def citedby(self, object: Publication)->_SearchScholarIterator:
    """Searches Google Scholar for other articles that cite this Publication
    and returns a Publication generator.

    Publications with at most 1000 citations are fetched with a single
    query. For publications with more citations the search is split into
    year ranges, because only 1000 results can be fetched per query.

    :param object: The Publication object whose citing articles are requested
    :type object: Publication
    :returns: an iterator over the citing publications, or None when
              ``object`` is not a Publication
    """
    if object['container_type'] != "Publication":
        self.logger.warning("Object not supported for citedby search")
        return None

    if object["num_citations"] <= 1000:
        return PublicationParser(self.__nav).citedby(object)

    # The year-by-year path lives in its own generator so that this method
    # is a plain function: a `return <value>` inside a generator function
    # would silently discard the value and hand the caller an empty generator.
    return self._citedby_long(object)

def _citedby_long(self, object: Publication):
    """Yield the publications citing ``object``, querying one year range at a time.

    Falls back to a single plain cited-by query (at most 1000 results)
    whenever the year ranges or the ``cites`` id cannot be determined.
    """
    self.logger.debug("Since the paper titled %s has %d citations (>1000), "
                      "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"])

    year_end = datetime.date.today().year

    if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        # Filling the publication is what provides `cites_per_year`.
        self.fill(object)
        years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end)
    else:
        try:
            year_low = int(object["bib"]["pub_year"])
        except (KeyError, ValueError):
            self.logger.warning("Unknown publication year for paper %s, may result in incorrect number "
                                "of citedby papers.", object["bib"]["title"])
            yield from PublicationParser(self.__nav).citedby(object)
            return

        # Go one year at a time in decreasing order
        years = [(year, year) for year in range(year_end, year_low - 1, -1)]

    if not years:
        # Without any year range nothing would be fetched at all;
        # a capped plain query is better than an empty result.
        self.logger.warning("No citation years available for paper %s, fetching at most 1000 "
                            "citedby papers.", object["bib"]["title"])
        yield from PublicationParser(self.__nav).citedby(object)
        return

    # Extract cites_id. Note: There could be multiple ones, separated by commas.
    # The character class is kept exactly as before; only the string is now raw.
    m = re.search(r"cites=([\d+,]*)", object.get("citedby_url", ""))
    if m is None:
        self.logger.warning("Could not find the cites id of paper %s, fetching at most 1000 "
                            "citedby papers.", object["bib"]["title"])
        yield from PublicationParser(self.__nav).citedby(object)
        return

    pub_id = m.group(1)
    for y_hi, y_lo in years:
        sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi)
        if sub_citations.total_results and (sub_citations.total_results > 1000):
            self.logger.warning("The paper titled %s has %d citations in the year %d. "
                                "Due to the limitation in Google Scholar, fetching only 1000 results "
                                "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo)
        yield from sub_citations
| 318 | + |
267 | 319 | def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author: |
268 | 320 | """Search by author id and return a single Author object |
269 | 321 | :param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'. |
|
0 commit comments