From 2d6ad161782ea930073cd6b4327a666f4754fbb7 Mon Sep 17 00:00:00 2001 From: cyy Date: Fri, 25 Apr 2025 22:44:48 +0800 Subject: [PATCH] Add ruff and associated fixes Signed-off-by: cyy --- docs/conf.py | 4 +- pyproject.toml | 18 + scholarly/__init__.py | 3 +- scholarly/_navigator.py | 189 ++++--- scholarly/_proxy_generator.py | 262 ++++++---- scholarly/_scholarly.py | 368 +++++++++----- scholarly/author_parser.py | 355 +++++++------ scholarly/data_types.py | 52 +- scholarly/publication_parser.py | 471 +++++++++-------- setup.py | 36 +- test_module.py | 860 ++++++++++++++++++-------------- 11 files changed, 1532 insertions(+), 1086 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fd06d34d..81b657dd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,7 +12,7 @@ # import os import sys -import sphinx_rtd_theme + sys.path.insert(0, os.path.abspath('..')) @@ -75,4 +75,4 @@ # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False \ No newline at end of file +todo_include_todos = False diff --git a/pyproject.toml b/pyproject.toml index 9787c3bd..883c52aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,21 @@ [build-system] requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta" + +[tool.ruff] +target-version = "py39" +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # isort + "I", + "RUF013" +] +ignore = ["E501", "F401"] diff --git a/scholarly/__init__.py b/scholarly/__init__.py index f339bc94..be80a644 100644 --- a/scholarly/__init__.py +++ b/scholarly/__init__.py @@ -1,4 +1,5 @@ +from ._proxy_generator import DOSException, MaxTriesExceededException, ProxyGenerator from ._scholarly import _Scholarly from .data_types import Author, Publication -from ._proxy_generator import ProxyGenerator, DOSException, MaxTriesExceededException + scholarly = _Scholarly() diff --git a/scholarly/_navigator.py b/scholarly/_navigator.py index 8bea7675..7f53e242 100644 --- a/scholarly/_navigator.py +++ b/scholarly/_navigator.py @@ -1,22 +1,18 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from ._proxy_generator import ProxyGenerator, MaxTriesExceededException, DOSException - -from bs4 import BeautifulSoup - import codecs +import contextlib import logging import random import time -from requests.exceptions import Timeout + +from bs4 import BeautifulSoup from httpx import TimeoutException +from requests.exceptions import Timeout from selenium.webdriver.common.by import By -from .publication_parser import _SearchScholarIterator + +from ._proxy_generator import DOSException, MaxTriesExceededException, ProxyGenerator from .author_parser import AuthorParser -from .publication_parser import PublicationParser -from .data_types import Author, PublicationSource, ProxyMode +from .data_types import Author, ProxyMode, PublicationSource +from .publication_parser import PublicationParser, _SearchScholarIterator class Singleton(type): @@ -24,17 +20,16 @@ class Singleton(type): def __call__(cls, *args, **kwargs): if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, - **kwargs) + cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] -class Navigator(object, metaclass=Singleton): +class Navigator(metaclass=Singleton): """A 
class used to navigate pages on google scholar.""" def __init__(self): - super(Navigator, self).__init__() - self.logger = logging.getLogger('scholarly') + super().__init__() + self.logger = logging.getLogger("scholarly") self._TIMEOUT = 5 self._max_retries = 5 # A Navigator instance has two proxy managers, each with their session. @@ -46,11 +41,10 @@ def __init__(self): self._session2 = self.pm2.get_session() self.got_403 = False - def set_logger(self, enable: bool): """Enable or disable the logger for google scholar.""" - self.logger.setLevel((logging.INFO if enable else logging.CRITICAL)) + self.logger.setLevel(logging.INFO if enable else logging.CRITICAL) def set_timeout(self, timeout: int): """Set timeout period in seconds for scholarly""" @@ -67,8 +61,10 @@ def use_proxy(self, pg1: ProxyGenerator, pg2: ProxyGenerator = None): self.pm2 = ProxyGenerator() proxy_works = self.pm2.FreeProxies() if not proxy_works: - self.logger.info("FreeProxy as a secondary proxy is not working. " - "Using the primary proxy for all requests") + self.logger.info( + "FreeProxy as a secondary proxy is not working. " + "Using the primary proxy for all requests" + ) self.pm2 = pg1 self._session1 = self.pm1.get_session() @@ -81,7 +77,6 @@ def _new_session(self, premium=True, **kwargs): else: self._session2 = self.pm2._new_session(**kwargs) - def _get_page(self, pagerequest: str, premium: bool = False) -> str: """Return the data from a webpage @@ -106,14 +101,16 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: premium = True if pm.proxy_mode is ProxyMode.SCRAPERAPI: self.set_timeout(60) - timeout=self._TIMEOUT + timeout = self._TIMEOUT while tries < self._max_retries: try: - w = random.uniform(1,2) + w = random.uniform(1, 2) time.sleep(w) resp = session.get(pagerequest, timeout=timeout) - if premium is False: # premium methods may contain sensitive information - self.logger.debug("Session proxy config is {}".format(pm._proxies)) + if ( + premium is False + ): # premium methods may contain sensitive information + self.logger.debug(f"Session proxy config is {pm._proxies}") has_captcha = self._requests_has_captcha(resp.text) @@ -136,51 +133,74 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: if not pm.has_proxy(): self.logger.info("No other connections possible.") if not self.got_403: - self.logger.info("Retrying immediately with another session.") + self.logger.info( + "Retrying immediately with another session." + ) else: - if pm.proxy_mode not in (ProxyMode.LUMINATI, ProxyMode.SCRAPERAPI): - w = random.uniform(60, 2*60) - self.logger.info("Will retry after %.2f seconds (with another session).", w) + if pm.proxy_mode not in ( + ProxyMode.LUMINATI, + ProxyMode.SCRAPERAPI, + ): + w = random.uniform(60, 2 * 60) + self.logger.info( + "Will retry after %.2f seconds (with another session).", + w, + ) time.sleep(w) self._new_session(premium=premium) self.got_403 = True - continue # Retry request within same session + continue # Retry request within same session else: - self.logger.info("We can use another connection... let's try that.") + self.logger.info( + "We can use another connection... let's try that." + ) elif resp.status_code == 302 and resp.has_redirect_location: self.logger.debug("Got a redirect.") pagerequest = resp.headers["location"] else: - self.logger.info("""Response code %d. - Retrying...""", resp.status_code) + self.logger.info( + """Response code %d. 
+ Retrying...""", + resp.status_code, + ) except DOSException: if not pm.has_proxy(): self.logger.info("No other connections possible.") - w = random.uniform(60, 2*60) - self.logger.info("Will retry after %.2f seconds (with the same session).", w) + w = random.uniform(60, 2 * 60) + self.logger.info( + "Will retry after %.2f seconds (with the same session).", w + ) time.sleep(w) continue except (Timeout, TimeoutException) as e: - err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args) + err = f"Timeout Exception {type(e).__name__} while fetching page: {e.args}" self.logger.info(err) - if timeout < 3*self._TIMEOUT: - self.logger.info("Increasing timeout and retrying within same session.") + if timeout < 3 * self._TIMEOUT: + self.logger.info( + "Increasing timeout and retrying within same session." + ) timeout = timeout + self._TIMEOUT continue self.logger.info("Giving up this session.") except Exception as e: - err = "Exception %s while fetching page: %s" % (type(e).__name__, e.args) + err = f"Exception {type(e).__name__} while fetching page: {e.args}" self.logger.info(err) self.logger.info("Retrying with a new session.") tries += 1 try: - session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http', None)) + session, timeout = pm.get_next_proxy( + num_tries=tries, + old_timeout=timeout, + old_proxy=pm._proxies.get("http", None), + ) except Exception: - self.logger.info("No other secondary connections possible. " - "Using the primary proxy for all requests.") + self.logger.info( + "No other secondary connections possible. " + "Using the primary proxy for all requests." + ) break # If secondary proxy does not work, try again primary proxy. @@ -189,13 +209,11 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: else: raise MaxTriesExceededException("Cannot Fetch from Google Scholar.") - def _set_retries(self, num_retries: int) -> None: - if (num_retries < 0): + if num_retries < 0: raise ValueError("num_retries must not be negative") self._max_retries = num_retries - def _requests_has_captcha(self, text) -> bool: """Tests whether some html text contains a captcha. 
@@ -205,8 +223,8 @@ def _requests_has_captcha(self, text) -> bool: :rtype: {bool} """ return self._has_captcha( - lambda i : f'id="{i}"' in text, - lambda c : f'class="{c}"' in text, + lambda i: f'id="{i}"' in text, + lambda c: f'class="{c}"' in text, ) def _webdriver_has_captcha(self, premium=True) -> bool: @@ -217,15 +235,15 @@ def _webdriver_has_captcha(self, premium=True) -> bool: """ pm = self.pm1 if premium else self.pm2 return self._has_captcha( - lambda i : len(pm._get_webdriver().find_elements(By.ID, i)) > 0, - lambda c : len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, + lambda i: len(pm._get_webdriver().find_elements(By.ID, i)) > 0, + lambda c: len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, ) def _has_captcha(self, got_id, got_class) -> bool: _CAPTCHA_IDS = [ - "gs_captcha_ccl", # the normal captcha div - "recaptcha", # the form used on full-page captchas - "captcha-form", # another form used on full-page captchas + "gs_captcha_ccl", # the normal captcha div + "recaptcha", # the form used on full-page captchas + "captcha-form", # another form used on full-page captchas ] _DOS_CLASSES = [ "rc-doscaptcha-body", @@ -236,39 +254,36 @@ def _has_captcha(self, got_id, got_class) -> bool: def _get_soup(self, url: str) -> BeautifulSoup: """Return the BeautifulSoup for a page on scholar.google.com""" - html = self._get_page('https://scholar.google.com{0}'.format(url)) - html = html.replace(u'\xa0', u' ') - res = BeautifulSoup(html, 'html.parser') - try: - self.publib = res.find('div', id='gs_res_glb').get('data-sva') - except Exception: - pass + html = self._get_page(f"https://scholar.google.com{url}") + html = html.replace("\xa0", " ") + res = BeautifulSoup(html, "html.parser") + with contextlib.suppress(Exception): + self.publib = res.find("div", id="gs_res_glb").get("data-sva") return res - def search_authors(self, url: str)->Author: + def search_authors(self, url: str) -> Author: """Generator that returns Author objects from the author search page""" soup = self._get_soup(url) author_parser = AuthorParser(self) while True: - rows = soup.find_all('div', 'gsc_1usr') + rows = soup.find_all("div", "gsc_1usr") self.logger.info("Found %d authors", len(rows)) for row in rows: yield author_parser.get_author(row) - cls1 = 'gs_btnPR gs_in_ib gs_btn_half ' - cls2 = 'gs_btn_lsb gs_btn_srt gsc_pgn_pnx' - next_button = soup.find(class_=cls1+cls2) # Can be improved - if next_button and 'disabled' not in next_button.attrs: + cls1 = "gs_btnPR gs_in_ib gs_btn_half " + cls2 = "gs_btn_lsb gs_btn_srt gsc_pgn_pnx" + next_button = soup.find(class_=cls1 + cls2) # Can be improved + if next_button and "disabled" not in next_button.attrs: self.logger.info("Loading next page of authors") - url = next_button['onclick'][17:-1] + url = next_button["onclick"][17:-1] url = codecs.getdecoder("unicode_escape")(url)[0] soup = self._get_soup(url) else: self.logger.info("No more author pages") break - def search_publication(self, url: str, - filled: bool = False) -> PublicationParser: + def search_publication(self, url: str, filled: bool = False) -> PublicationParser: """Search by scholar query and return a single Publication object :param url: the url to be searched at @@ -280,7 +295,10 @@ def search_publication(self, url: str, """ soup = self._get_soup(url) publication_parser = PublicationParser(self) - pub = publication_parser.get_publication(soup.find_all('div', 'gs_or')[0], PublicationSource.PUBLICATION_SEARCH_SNIPPET) + pub = publication_parser.get_publication( + soup.find_all("div", 
"gs_or")[0], + PublicationSource.PUBLICATION_SEARCH_SNIPPET, + ) if filled: pub = publication_parser.fill(pub) return pub @@ -295,7 +313,13 @@ def search_publications(self, url: str) -> _SearchScholarIterator: """ return _SearchScholarIterator(self, url) - def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0) -> Author: + def search_author_id( + self, + id: str, + filled: bool = False, + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author: """Search by author ID and return a Author object :param id: the Google Scholar id of a particular author :type url: str @@ -311,30 +335,39 @@ def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby author_parser = AuthorParser(self) res = author_parser.get_author(id) if filled: - res = author_parser.fill(res, sortby=sortby, publication_limit=publication_limit) + res = author_parser.fill( + res, sortby=sortby, publication_limit=publication_limit + ) else: - res = author_parser.fill(res, sections=['basics'], sortby=sortby, publication_limit=publication_limit) + res = author_parser.fill( + res, + sections=["basics"], + sortby=sortby, + publication_limit=publication_limit, + ) return res def search_organization(self, url: str, fromauthor: bool) -> list: """Generate instiution object from author search page. - if no results are found and `fromuthor` is True, then use the first author from the search - to get institution/organization name. + if no results are found and `fromuthor` is True, then use the first author from the search + to get institution/organization name. """ soup = self._get_soup(url) - rows = soup.find_all('h3', 'gsc_inst_res') + rows = soup.find_all("h3", "gsc_inst_res") if rows: self.logger.info("Found institution") res = [] for row in rows: - res.append({'Organization': row.a.text, 'id': row.a['href'].split('org=', 1)[1]}) + res.append( + {"Organization": row.a.text, "id": row.a["href"].split("org=", 1)[1]} + ) if rows == [] and fromauthor is True: try: auth = next(self.search_authors(url)) authorg = self.search_author_id(auth.id).organization - authorg['fromauthor'] = True + authorg["fromauthor"] = True res.append(authorg) except Exception: res = [] diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py index 2d2ec6f3..9cc006dd 100644 --- a/scholarly/_proxy_generator.py +++ b/scholarly/_proxy_generator.py @@ -1,21 +1,25 @@ -from typing import Callable -from fp.fp import FreeProxy -import random import logging +import random +import tempfile import time -import requests +from contextlib import contextmanager +from typing import Callable, Optional +from urllib.parse import urlparse + import httpx -import tempfile +import requests import urllib3 - +from deprecated import deprecated +from fp.fp import FreeProxy from selenium import webdriver -from selenium.webdriver.support.wait import WebDriverWait, TimeoutException +from selenium.common.exceptions import ( + UnexpectedAlertPresentException, + WebDriverException, +) from selenium.webdriver.common.by import By -from selenium.common.exceptions import WebDriverException, UnexpectedAlertPresentException from selenium.webdriver.firefox.options import Options as FirefoxOptions -from urllib.parse import urlparse -from contextlib import contextmanager -from deprecated import deprecated +from selenium.webdriver.support.wait import TimeoutException, WebDriverWait + try: import stem.process from stem import Signal @@ -25,10 +29,11 @@ try: from fake_useragent import UserAgent + 
FAKE_USERAGENT = True except Exception: FAKE_USERAGENT = False - DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36' + DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36" from .data_types import ProxyMode @@ -41,10 +46,10 @@ class MaxTriesExceededException(Exception): """Maximum number of tries by scholarly reached""" -class ProxyGenerator(object): +class ProxyGenerator: def __init__(self): # setting up logger - self.logger = logging.getLogger('scholarly') + self.logger = logging.getLogger("scholarly") self._proxy_gen = None # If we use a proxy or Tor, we set this to True @@ -71,7 +76,7 @@ def get_session(self): return self._session def Luminati(self, usr, passwd, proxy_port): - """ Setups a luminati proxy without refreshing capabilities. + """Setups a luminati proxy without refreshing capabilities. :param usr: scholarly username, optional by default None :type usr: string @@ -86,12 +91,14 @@ def Luminati(self, usr, passwd, proxy_port): >>> pg = ProxyGenerator() >>> success = pg.Luminati(usr = foo, passwd = bar, port = 1200) """ - if (usr is not None and passwd is not None and proxy_port is not None): + if usr is not None and passwd is not None and proxy_port is not None: username = usr password = passwd port = proxy_port else: - self.logger.warning("Not enough parameters were provided for the Luminati proxy. Reverting to a local connection.") + self.logger.warning( + "Not enough parameters were provided for the Luminati proxy. Reverting to a local connection." + ) return session_id = random.random() proxy = f"http://{username}-session-{session_id}:{password}@zproxy.lum-superproxy.io:{port}" @@ -125,7 +132,11 @@ def SingleProxy(self, http=None, https=None): self.proxy_mode = ProxyMode.SINGLEPROXY self.logger.info("Proxy setup successfully") else: - self.logger.warning("Unable to setup the proxy: http=%s https=%s. Reason unknown." , http, https) + self.logger.warning( + "Unable to setup the proxy: http=%s https=%s. Reason unknown.", + http, + https, + ) return proxy_works def _check_proxy(self, proxies) -> bool: @@ -137,12 +148,13 @@ def _check_proxy(self, proxies) -> bool: """ with requests.Session() as session: # Reformat proxy for requests. Requests and HTTPX use different proxy format. - session.proxies = {'http':proxies['http://'], 'https':proxies['https://']} + session.proxies = {"http": proxies["http://"], "https": proxies["https://"]} try: resp = session.get("http://httpbin.org/ip", timeout=self._TIMEOUT) if resp.status_code == 200: - self.logger.info("Proxy works! IP address: %s", - resp.json()["origin"]) + self.logger.info( + "Proxy works! IP address: %s", resp.json()["origin"] + ) return True elif resp.status_code == 401: self.logger.warning("Incorrect credentials for proxy!") @@ -152,10 +164,16 @@ def _check_proxy(self, proxies) -> bool: except Exception as e: # Failure is common and expected with free proxy. # Do not log at warning level and annoy users. 
- level = logging.DEBUG if self.proxy_mode is ProxyMode.FREE_PROXIES else logging.WARNING + level = ( + logging.DEBUG + if self.proxy_mode is ProxyMode.FREE_PROXIES + else logging.WARNING + ) self.logger.log(level, "Exception while testing proxy: %s", e) if self.proxy_mode in (ProxyMode.LUMINATI, ProxyMode.SCRAPERAPI): - self.logger.warning("Double check your credentials and try increasing the timeout") + self.logger.warning( + "Double check your credentials and try increasing the timeout" + ) return False @@ -179,7 +197,7 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool: self.logger.info(err) return (False, None) - def _use_proxy(self, http: str, https: str = None) -> bool: + def _use_proxy(self, http: str, https: Optional[str] = None) -> bool: """Allows user to set their own proxy for the connection session. Sets the proxy if it works. @@ -198,16 +216,21 @@ def _use_proxy(self, http: str, https: str = None) -> bool: elif https[:5] not in ("https", "socks"): https = "https://" + https - proxies = {'http://': http, 'https://': https} + proxies = {"http://": http, "https://": https} if self.proxy_mode == ProxyMode.SCRAPERAPI: - r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json() + r = requests.get( + "http://api.scraperapi.com/account", params={"api_key": self._API_KEY} + ).json() if "error" in r: self.logger.warning(r["error"]) self._proxy_works = False else: self._proxy_works = r["requestCount"] < int(r["requestLimit"]) - self.logger.info("Successful ScraperAPI requests %d / %d", - r["requestCount"], r["requestLimit"]) + self.logger.info( + "Successful ScraperAPI requests %d / %d", + r["requestCount"], + r["requestLimit"], + ) else: self._proxy_works = self._check_proxy(proxies) @@ -217,8 +240,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool: return self._proxy_works - @deprecated(version='1.5', reason="Tor methods are deprecated and are not actively tested.") - def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: str): + @deprecated( + version="1.5", reason="Tor methods are deprecated and are not actively tested." + ) + def Tor_External( + self, tor_sock_port: int, tor_control_port: int, tor_password: str + ): """ Setting up Tor Proxy. A tor service should be already running on the system. Otherwise you might want to use Tor_Internal @@ -236,8 +263,10 @@ def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: Note: This method is deprecated since v1.5 """ if stem is None: - raise RuntimeError("Tor methods are not supported with basic version of the package. " - "Please install scholarly[tor] to use this method.") + raise RuntimeError( + "Tor methods are not supported with basic version of the package. " + "Please install scholarly[tor] to use this method." + ) self._TIMEOUT = 10 @@ -259,12 +288,14 @@ def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: "proxy_works": self._proxy_works, "refresh_works": self._can_refresh_tor, "tor_control_port": tor_control_port, - "tor_sock_port": tor_sock_port + "tor_sock_port": tor_sock_port, } - @deprecated(version='1.5', reason="Tor methods are deprecated and are not actively tested") + @deprecated( + version="1.5", reason="Tor methods are deprecated and are not actively tested" + ) def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): - ''' + """ Starts a Tor client running in a scholarly-specific port, together with a scholarly-specific control port. 
If no arguments are passed for the tor_sock_port and the tor_control_port they are automatically generated in the following ranges - tor_sock_port: (9000, 9500) @@ -282,20 +313,24 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): pg.Tor_Internal(tor_cmd = 'tor') Note: This method is deprecated since v1.5 - ''' + """ if stem is None: - raise RuntimeError("Tor methods are not supported with basic version of the package. " - "Please install scholarly[tor] to use this method.") + raise RuntimeError( + "Tor methods are not supported with basic version of the package. " + "Please install scholarly[tor] to use this method." + ) self.logger.info("Attempting to start owned Tor as the proxy") if tor_cmd is None: - self.logger.info("No tor_cmd argument passed. This should point to the location of Tor executable.") + self.logger.info( + "No tor_cmd argument passed. This should point to the location of Tor executable." + ) return { "proxy_works": False, "refresh_works": False, "tor_control_port": None, - "tor_sock_port": None + "tor_sock_port": None, } if tor_sock_port is None: @@ -312,9 +347,9 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): self._tor_process = stem.process.launch_tor_with_config( tor_cmd=tor_cmd, config={ - 'ControlPort': str(tor_control_port), - 'SocksPort': str(tor_sock_port), - 'DataDirectory': tempfile.mkdtemp() + "ControlPort": str(tor_control_port), + "SocksPort": str(tor_sock_port), + "DataDirectory": tempfile.mkdtemp(), # TODO Perhaps we want to also set a password here }, # take_ownership=True # Taking this out for now, as it seems to cause trouble @@ -324,9 +359,9 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): def _has_captcha(self, got_id, got_class) -> bool: _CAPTCHA_IDS = [ - "gs_captcha_ccl", # the normal captcha div - "recaptcha", # the form used on full-page captchas - "captcha-form", # another form used on full-page captchas + "gs_captcha_ccl", # the normal captcha div + "recaptcha", # the form used on full-page captchas + "captcha-form", # another form used on full-page captchas ] _DOS_CLASSES = [ "rc-doscaptcha-body", @@ -342,8 +377,8 @@ def _webdriver_has_captcha(self) -> bool: :rtype: {bool} """ return self._has_captcha( - lambda i : len(self._get_webdriver().find_elements(By.ID, i)) > 0, - lambda c : len(self._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, + lambda i: len(self._get_webdriver().find_elements(By.ID, i)) > 0, + lambda c: len(self._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, ) def _get_webdriver(self): @@ -366,32 +401,36 @@ def _get_webdriver(self): def _get_chrome_webdriver(self): if self._proxy_works: - webdriver.DesiredCapabilities.CHROME['proxy'] = { - "httpProxy": self._proxies['http://'], - "sslProxy": self._proxies['https://'], - "proxyType": "MANUAL" + webdriver.DesiredCapabilities.CHROME["proxy"] = { + "httpProxy": self._proxies["http://"], + "sslProxy": self._proxies["https://"], + "proxyType": "MANUAL", } options = webdriver.ChromeOptions() - options.add_argument('--headless') - self._webdriver = webdriver.Chrome('chromedriver', options=options) - self._webdriver.get("https://scholar.google.com") # Need to pre-load to set cookies later + options.add_argument("--headless") + self._webdriver = webdriver.Chrome("chromedriver", options=options) + self._webdriver.get( + "https://scholar.google.com" + ) # Need to pre-load to set cookies later return self._webdriver def _get_firefox_webdriver(self): if self._proxy_works: # 
Redirect webdriver through proxy - webdriver.DesiredCapabilities.FIREFOX['proxy'] = { - "httpProxy": self._proxies['http://'], - "sslProxy": self._proxies['https://'], + webdriver.DesiredCapabilities.FIREFOX["proxy"] = { + "httpProxy": self._proxies["http://"], + "sslProxy": self._proxies["https://"], "proxyType": "MANUAL", } options = FirefoxOptions() - options.add_argument('--headless') + options.add_argument("--headless") self._webdriver = webdriver.Firefox(options=options) - self._webdriver.get("https://scholar.google.com") # Need to pre-load to set cookies later + self._webdriver.get( + "https://scholar.google.com" + ) # Need to pre-load to set cookies later # It might make sense to (pre)set cookies as well, e.g., to set a GSP ID. # However, a limitation of webdriver makes it impossible to set cookies for @@ -406,42 +445,54 @@ def _handle_captcha2(self, url): cur_host = urlparse(self._get_webdriver().current_url).hostname for cookie in self._session.cookies: # Only set cookies matching the current domain, cf. https://github.com/w3c/webdriver/issues/1238 - if cur_host is cookie.domain.lstrip('.'): - self._get_webdriver().add_cookie({ - 'name': cookie.name, - 'value': cookie.value, - 'path': cookie.path, - 'domain':cookie.domain, - }) + if cur_host is cookie.domain.lstrip("."): + self._get_webdriver().add_cookie( + { + "name": cookie.name, + "value": cookie.value, + "path": cookie.path, + "domain": cookie.domain, + } + ) self._get_webdriver().get(url) log_interval = 10 cur = 0 - timeout = 60*60*24*7 # 1 week + timeout = 60 * 60 * 24 * 7 # 1 week while cur < timeout: try: - cur = cur + log_interval # Update before exceptions can happen - WebDriverWait(self._get_webdriver(), log_interval).until_not(lambda drv : self._webdriver_has_captcha()) + cur = cur + log_interval # Update before exceptions can happen + WebDriverWait(self._get_webdriver(), log_interval).until_not( + lambda drv: self._webdriver_has_captcha() + ) break except TimeoutException: - self.logger.info(f"Solving the captcha took already {cur} seconds (of maximum {timeout} s).") + self.logger.info( + f"Solving the captcha took already {cur} seconds (of maximum {timeout} s)." + ) except UnexpectedAlertPresentException as e: # This can apparently happen when reCAPTCHA has hiccups: # "Cannot contact reCAPTCHA. Check your connection and try again." - self.logger.info(f"Unexpected alert while waiting for captcha completion: {e.args}") + self.logger.info( + f"Unexpected alert while waiting for captcha completion: {e.args}" + ) time.sleep(15) except DOSException as e: self.logger.info("Google thinks we are DOSing the captcha.") raise e - except (WebDriverException) as e: + except WebDriverException as e: self.logger.info("Browser seems to be dysfunctional - closed by user?") raise e except Exception as e: # TODO: This exception handler should eventually be removed when # we know the "typical" (non-error) exceptions that can occur. - self.logger.info(f"Unhandled {type(e).__name__} while waiting for captcha completion: {e.args}") + self.logger.info( + f"Unhandled {type(e).__name__} while waiting for captcha completion: {e.args}" + ) else: - raise TimeoutException(f"Could not solve captcha in time (within {timeout} s).") + raise TimeoutException( + f"Could not solve captcha in time (within {timeout} s)." 
+ ) self.logger.info(f"Solved captcha in less than {cur} seconds.") for cookie in self._get_webdriver().get_cookies(): @@ -464,21 +515,21 @@ def _new_session(self, **kwargs): if FAKE_USERAGENT: # Suppress the misleading traceback from UserAgent() - with self._suppress_logger('fake_useragent'): + with self._suppress_logger("fake_useragent"): user_agent = UserAgent().random else: user_agent = DEFAULT_USER_AGENT _HEADERS = { - 'accept-language': 'en-US,en', - 'accept': 'text/html,application/xhtml+xml,application/xml', - 'User-Agent': user_agent, + "accept-language": "en-US,en", + "accept": "text/html,application/xhtml+xml,application/xml", + "User-Agent": user_agent, } # self._session.headers.update(_HEADERS) init_kwargs.update(headers=_HEADERS) if self._proxy_works: - init_kwargs["proxies"] = proxies #.get("http", None) + init_kwargs["proxies"] = proxies # .get("http", None) self._proxies = proxies if self.proxy_mode is ProxyMode.SCRAPERAPI: # SSL Certificate verification must be disabled for @@ -505,7 +556,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120): It takes back the proxies that stopped working and marks it as dirty. """ freeproxy = FreeProxy(rand=False, timeout=timeout) - if not hasattr(self, '_dirty_freeproxies'): + if not hasattr(self, "_dirty_freeproxies"): self._dirty_freeproxies = set() try: all_proxies = freeproxy.get_proxy_list(repeat=False) # free-proxy >= 1.1.0 @@ -514,16 +565,16 @@ def _fp_coroutine(self, timeout=1, wait_time=120): all_proxies.reverse() # Try the older proxies first t1 = time.time() - while (time.time()-t1 < wait_time): + while time.time() - t1 < wait_time: proxy = all_proxies.pop() if not all_proxies: all_proxies = freeproxy.get_proxy_list() if proxy in self._dirty_freeproxies: continue - proxies = {'http://': proxy, 'https://': proxy} + proxies = {"http://": proxy, "https://": proxy} proxy_works = self._check_proxy(proxies) if proxy_works: - dirty_proxy = (yield proxy) + dirty_proxy = yield proxy t1 = time.time() else: dirty_proxy = proxy @@ -565,9 +616,10 @@ def FreeProxies(self, timeout=1, wait_time=120): if n_tries == n_retries: n_dirty = len(self._dirty_freeproxies) self._fp_gen.close() - msg = ("None of the free proxies are working at the moment. " - f"Marked {n_dirty} proxies dirty. Try again after a few minutes." - ) + msg = ( + "None of the free proxies are working at the moment. " + f"Marked {n_dirty} proxies dirty. Try again after a few minutes." + ) raise MaxTriesExceededException(msg) else: return True @@ -595,7 +647,9 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): raise ValueError("ScraperAPI API Key is required.") # Get basic account information. This will NOT be counted towards successful API requests. - r = requests.get("http://api.scraperapi.com/account", params={'api_key': API_KEY}).json() + r = requests.get( + "http://api.scraperapi.com/account", params={"api_key": API_KEY} + ).json() if "error" in r: self.logger.warning(r["error"]) return False @@ -604,8 +658,11 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): self.proxy_mode = ProxyMode.SCRAPERAPI r["requestLimit"] = int(r["requestLimit"]) - self.logger.info("Successful ScraperAPI requests %d / %d", - r["requestCount"], r["requestLimit"]) + self.logger.info( + "Successful ScraperAPI requests %d / %d", + r["requestCount"], + r["requestLimit"], + ) # ScraperAPI documentation recommends setting the timeout to 60 seconds # so it has had a chance to try out all the retries. 
@@ -624,14 +681,18 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) for _ in range(3): - proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001') + proxy_works = self._use_proxy( + http=f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001" + ) if proxy_works: - proxies = {'http://': f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001",} + proxies = { + "http://": f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001", + } self.logger.info("ScraperAPI proxy setup successfully") self._new_session(verify=False, proxies=proxies) return proxy_works - if (r["requestCount"] >= r["requestLimit"]): + if r["requestCount"] >= r["requestLimit"]: self.logger.warning("ScraperAPI account limit reached.") else: self.logger.warning("ScraperAPI does not seem to work. Reason unknown.") @@ -645,22 +706,22 @@ def _set_proxy_generator(self, gen: Callable[..., str]) -> bool: self._proxy_gen = gen return True - def get_next_proxy(self, num_tries = None, old_timeout = 3, old_proxy=None): + def get_next_proxy(self, num_tries=None, old_timeout=3, old_proxy=None): new_timeout = old_timeout if self._can_refresh_tor: # Check if Tor is running and refresh it self.logger.info("Refreshing Tor ID...") self._refresh_tor_id(self._tor_control_port, self._tor_password) - time.sleep(5) # wait for the refresh to happen - new_timeout = self._TIMEOUT # Reset timeout to default + time.sleep(5) # wait for the refresh to happen + new_timeout = self._TIMEOUT # Reset timeout to default elif self._proxy_gen: - if (num_tries): + if num_tries: self.logger.info("Try #%d failed. Switching proxy.", num_tries) # Try to get another proxy new_proxy = self._proxy_gen(old_proxy) - while (not self._use_proxy(new_proxy)): + while not self._use_proxy(new_proxy): new_proxy = self._proxy_gen(new_proxy) - new_timeout = self._TIMEOUT # Reset timeout to default + new_timeout = self._TIMEOUT # Reset timeout to default self._new_session() else: self._new_session() @@ -672,8 +733,7 @@ def get_next_proxy(self, num_tries = None, old_timeout = 3, old_proxy=None): @staticmethod @contextmanager def _suppress_logger(loggerName: str, level=logging.CRITICAL): - """Temporarily suppress logging output from a specific logger. 
- """ + """Temporarily suppress logging output from a specific logger.""" logger = logging.getLogger(loggerName) original_level = logger.getEffectiveLevel() logger.setLevel(level) diff --git a/scholarly/_scholarly.py b/scholarly/_scholarly.py index 4f64f516..df4f9960 100644 --- a/scholarly/_scholarly.py +++ b/scholarly/_scholarly.py @@ -1,28 +1,39 @@ """scholarly.py""" -import requests -import re -import os + import copy import csv -import pprint import datetime +import os +import pprint import re -from typing import Dict, List, Union +from typing import Optional, Union + +import requests +from dotenv import find_dotenv, load_dotenv + from ._navigator import Navigator from ._proxy_generator import ProxyGenerator -from dotenv import find_dotenv, load_dotenv from .author_parser import AuthorParser +from .data_types import ( + Author, + AuthorSource, + CitesPerYear, + Journal, + Publication, + PublicationSource, +) from .publication_parser import PublicationParser, _SearchScholarIterator -from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource -_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}' -_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}' -_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}' +_AUTHSEARCH = "/citations?hl=en&view_op=search_authors&mauthors={0}" +_KEYWORDSEARCH = "/citations?hl=en&view_op=search_authors&mauthors=label:{0}" +_KEYWORDSEARCHBASE = "/citations?hl=en&view_op=search_authors&mauthors={}" _KEYWORDSEARCH_PATTERN = "[-: #(),;]+" # Unallowed characters in the keywords. -_PUBSEARCH = '/scholar?hl=en&q={0}' -_CITEDBYSEARCH = '/scholar?hl=en&cites={0}' +_PUBSEARCH = "/scholar?hl=en&q={0}" +_CITEDBYSEARCH = "/scholar?hl=en&cites={0}" _ORGSEARCH = "/citations?view_op=view_org&hl=en&org={0}" -_MANDATES_URL = "https://scholar.google.com/citations?view_op=mandates_leaderboard_csv&hl=en" +_MANDATES_URL = ( + "https://scholar.google.com/citations?view_op=mandates_leaderboard_csv&hl=en" +) class _Scholarly: @@ -41,7 +52,7 @@ def journal_categories(self): self._journal_categories = self.get_journal_categories() return self._journal_categories - def set_retries(self, num_retries: int)->None: + def set_retries(self, num_retries: int) -> None: """Sets the number of retries in case of errors :param num_retries: the number of retries @@ -50,8 +61,11 @@ def set_retries(self, num_retries: int)->None: return self.__nav._set_retries(num_retries) - def use_proxy(self, proxy_generator: ProxyGenerator, - secondary_proxy_generator: ProxyGenerator = None) -> None: + def use_proxy( + self, + proxy_generator: ProxyGenerator, + secondary_proxy_generator: ProxyGenerator = None, + ) -> None: """Select which proxy method to use. See the available ProxyGenerator methods. @@ -77,7 +91,6 @@ def use_proxy(self, proxy_generator: ProxyGenerator, """ self.__nav.use_proxy(proxy_generator, secondary_proxy_generator) - def set_logger(self, enable: bool): """Enable or disable the logger for google scholar. 
Enabled by default @@ -88,12 +101,17 @@ def set_timeout(self, timeout: int): """Set timeout period in seconds for scholarly""" self.__nav.set_timeout(timeout) - def search_pubs(self, - query: str, patents: bool = True, - citations: bool = True, year_low: int = None, - year_high: int = None, sort_by: str = "relevance", - include_last_year: str = "abstracts", - start_index: int = 0)->_SearchScholarIterator: + def search_pubs( + self, + query: str, + patents: bool = True, + citations: bool = True, + year_low: Optional[int] = None, + year_high: Optional[int] = None, + sort_by: str = "relevance", + include_last_year: str = "abstracts", + start_index: int = 0, + ) -> _SearchScholarIterator: """Searches by query and returns a generator of Publication objects :param query: terms to be searched @@ -154,9 +172,16 @@ def search_pubs(self, 'url_scholarbib': '/scholar?q=info:K8ZpoI6hZNoJ:scholar.google.com/&output=cite&scirp=0&hl=en'} """ - url = self._construct_url(_PUBSEARCH.format(requests.utils.quote(query)), patents=patents, - citations=citations, year_low=year_low, year_high=year_high, - sort_by=sort_by, include_last_year=include_last_year, start_index=start_index) + url = self._construct_url( + _PUBSEARCH.format(requests.utils.quote(query)), + patents=patents, + citations=citations, + year_low=year_low, + year_high=year_high, + sort_by=sort_by, + include_last_year=include_last_year, + start_index=start_index, + ) return self.__nav.search_publications(url) def search_citedby(self, publication_id: Union[int, str], **kwargs): @@ -170,7 +195,9 @@ def search_citedby(self, publication_id: Union[int, str], **kwargs): url = self._construct_url(_CITEDBYSEARCH.format(str(publication_id)), **kwargs) return self.__nav.search_publications(url) - def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationParser: + def search_single_pub( + self, pub_title: str, filled: bool = False + ) -> PublicationParser: """Search by scholar query and return a single Publication container object :param pub_title: Title of the publication to search @@ -208,7 +235,13 @@ def search_author(self, name: str): url = _AUTHSEARCH.format(requests.utils.quote(name)) return self.__nav.search_authors(url) - def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_limit: int = 0) -> Author or Publication: + def fill( + self, + object: dict, + sections=None, + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author or Publication: """Fills the object according to its type. If the container type is Author it will fill the additional author fields If it is Publication it will fill it accordingly. @@ -228,24 +261,26 @@ def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_l the publication entries are also marked whether they satisfy public access mandates or not. 
""" - if object['container_type'] == "Author": + if sections is None: + sections = [] + if object["container_type"] == "Author": author_parser = AuthorParser(self.__nav) object = author_parser.fill(object, sections, sortby, publication_limit) if object is False: raise ValueError("Incorrect input") - elif object['container_type'] == "Publication": + elif object["container_type"] == "Publication": publication_parser = PublicationParser(self.__nav) object = publication_parser.fill(object) return object - def bibtex(self, object: Publication)->str: + def bibtex(self, object: Publication) -> str: """Returns a bibtex entry for a publication that has either Scholar source or citation source :param object: The Publication object for the bibtex exportation :type object: Publication """ - if object['container_type'] == "Publication": + if object["container_type"] == "Publication": publication_parser = PublicationParser(self.__nav) return publication_parser.bibtex(object) else: @@ -271,7 +306,7 @@ def _bin_citations_by_year(cites_per_year: CitesPerYear, year_end): return years - def citedby(self, object: Publication)->_SearchScholarIterator: + def citedby(self, object: Publication) -> _SearchScholarIterator: """Searches Google Scholar for other articles that cite this Publication and returns a Publication generator. @@ -279,47 +314,71 @@ def citedby(self, object: Publication)->_SearchScholarIterator: :type object: Publication """ - if object['container_type'] != "Publication": + if object["container_type"] != "Publication": self.logger.warning("Object not supported for bibtex exportation") return if object["num_citations"] <= 1000: return PublicationParser(self.__nav).citedby(object) - self.logger.debug("Since the paper titled %s has %d citations (>1000), " - "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"]) + self.logger.debug( + "Since the paper titled %s has %d citations (>1000), " + "fetching it on an annual basis.", + object["bib"]["title"], + object["num_citations"], + ) year_end = int(datetime.date.today().year) if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: self.fill(object) - years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end) + years = self._bin_citations_by_year( + object.get("cites_per_year", {}), year_end + ) else: try: year_low = int(object["bib"]["pub_year"]) except KeyError: - self.logger.warning("Unknown publication year for paper %s, may result in incorrect number " - "of citedby papers.", object["bib"]["title"]) + self.logger.warning( + "Unknown publication year for paper %s, may result in incorrect number " + "of citedby papers.", + object["bib"]["title"], + ) return PublicationParser(self.__nav).citedby(object) # Go one year at a time in decreasing order - years = zip(range(year_end, year_low-1, -1), range(year_end, year_low-1, -1)) + years = zip( + range(year_end, year_low - 1, -1), range(year_end, year_low - 1, -1) + ) - return self._citedby_long(object,years) + return self._citedby_long(object, years) def _citedby_long(self, object: Publication, years): # Extract cites_id. Note: There could be multiple ones, separated by commas. 
- m = re.search("cites=[\d+,]*", object["citedby_url"]) + m = re.search(r"cites=[\d+,]*", object["citedby_url"]) pub_id = m.group()[6:] for y_hi, y_lo in years: - sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi) + sub_citations = self.search_citedby( + publication_id=pub_id, year_low=y_lo, year_high=y_hi + ) if sub_citations.total_results and (sub_citations.total_results > 1000): - self.logger.warn("The paper titled %s has %d citations in the year %d. " - "Due to the limitation in Google Scholar, fetching only 1000 results " - "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo) + self.logger.warn( + "The paper titled %s has %d citations in the year %d. " + "Due to the limitation in Google Scholar, fetching only 1000 results " + "from that year.", + object["bib"]["title"], + sub_citations.total_results, + y_lo, + ) yield from sub_citations - def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author: + def search_author_id( + self, + id: str, + filled: bool = False, + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author: """Search by author id and return a single Author object :param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'. :type sortby: string @@ -383,7 +442,7 @@ def search_keyword(self, keyword: str): url = _KEYWORDSEARCH.format(requests.utils.quote(reg_keyword)) return self.__nav.search_authors(url) - def search_keywords(self, keywords: List[str]): + def search_keywords(self, keywords: list[str]): """Search by keywords and return a generator of Author objects :param keywords: a list of keywords to be searched @@ -414,13 +473,17 @@ def search_keywords(self, keywords: List[str]): 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'} """ - reg_keywords = (re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords) - formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in reg_keywords] - formated_keywords = '+'.join(formated_keywords) + reg_keywords = ( + re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords + ) + formated_keywords = [ + "label:" + requests.utils.quote(keyword) for keyword in reg_keywords + ] + formated_keywords = "+".join(formated_keywords) url = _KEYWORDSEARCHBASE.format(formated_keywords) return self.__nav.search_authors(url) - def search_pubs_custom_url(self, url: str)->_SearchScholarIterator: + def search_pubs_custom_url(self, url: str) -> _SearchScholarIterator: """Search by custom URL and return a generator of Publication objects URL should be of the form '/scholar?q=...' @@ -433,7 +496,7 @@ def search_pubs_custom_url(self, url: str)->_SearchScholarIterator: """ return self.__nav.search_publications(url) - def search_author_custom_url(self, url: str)->Author: + def search_author_custom_url(self, url: str) -> Author: """Search by custom URL and return a generator of Author objects URL should be of the form '/citation?q=...' @@ -442,57 +505,59 @@ def search_author_custom_url(self, url: str)->Author: """ return self.__nav.search_authors(url) - def get_related_articles(self, object: Publication)->_SearchScholarIterator: + def get_related_articles(self, object: Publication) -> _SearchScholarIterator: """ Search google scholar for related articles to a specific publication. 
:param object: Publication object used to get the related articles :type object: Publication """ - if object['container_type'] != 'Publication': + if object["container_type"] != "Publication": self.logger.warning("Not a publication object") return - if object['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: - if 'url_related_articles' not in object.keys(): + if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: + if "url_related_articles" not in object: object = self.fill(object) - return self.__nav.search_publications(object['url_related_articles']) - elif object['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: - return self.__nav.search_publications(object['url_related_articles']) + return self.__nav.search_publications(object["url_related_articles"]) + elif object["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: + return self.__nav.search_publications(object["url_related_articles"]) - def pprint(self, object: Author or Publication)->None: + def pprint(self, object: Author or Publication) -> None: """Pretty print an Author or Publication container object :param object: Publication or Author container object :type object: Author or Publication """ - if 'container_type' not in object: + if "container_type" not in object: self.logger.warning("Not a scholarly container object") return to_print = copy.deepcopy(object) - if to_print['container_type'] == 'Publication': - to_print['source'] = PublicationSource(to_print['source']).name - elif to_print['container_type'] == 'Author': + if to_print["container_type"] == "Publication": + to_print["source"] = PublicationSource(to_print["source"]).name + elif to_print["container_type"] == "Author": parser = AuthorParser(self.__nav) - to_print['source'] = AuthorSource(to_print['source']).name - if parser._sections == to_print['filled']: - to_print['filled'] = True + to_print["source"] = AuthorSource(to_print["source"]).name + if parser._sections == to_print["filled"]: + to_print["filled"] = True else: - to_print['filled'] = False - - if 'coauthors' in to_print: - for coauthor in to_print['coauthors']: - coauthor['filled'] = False - del coauthor['container_type'] - coauthor['source'] = AuthorSource(coauthor['source']).name - - if 'publications' in to_print: - for publication in to_print['publications']: - publication['source'] = PublicationSource(publication['source']).name - del publication['container_type'] - - del to_print['container_type'] + to_print["filled"] = False + + if "coauthors" in to_print: + for coauthor in to_print["coauthors"]: + coauthor["filled"] = False + del coauthor["container_type"] + coauthor["source"] = AuthorSource(coauthor["source"]).name + + if "publications" in to_print: + for publication in to_print["publications"]: + publication["source"] = PublicationSource( + publication["source"] + ).name + del publication["container_type"] + + del to_print["container_type"] print(pprint.pformat(to_print).encode("utf-8")) def search_org(self, name: str, fromauthor: bool = False) -> list: @@ -533,15 +598,18 @@ def search_author_by_organization(self, organization_id: int): url = _ORGSEARCH.format(organization_id) return self.__nav.search_authors(url) - def download_mandates_csv(self, filename: str, overwrite: bool = False, - include_links: bool =True): + def download_mandates_csv( + self, filename: str, overwrite: bool = False, include_links: bool = True + ): """ Download the CSV file of the current mandates. 
""" if (not overwrite) and os.path.exists(filename): - raise ValueError(f"{filename} already exists. Either provide a " - "different filename or allow overwriting by " - "setting overwrite=True") + raise ValueError( + f"{filename} already exists. Either provide a " + "different filename or allow overwriting by " + "setting overwrite=True" + ) text = self.__nav._get_page(_MANDATES_URL, premium=False) if include_links: soup = self.__nav._get_soup("/citations?hl=en&view_op=mandates_leaderboard") @@ -550,7 +618,7 @@ def download_mandates_csv(self, filename: str, overwrite: bool = False, cached = agency.find("span", class_="gs_a").a["href"] name = agency.a.text if name != "cached": - policy = agency.a['href'] + policy = agency.a["href"] else: name = agency.text[:-10] policy = "" @@ -560,39 +628,49 @@ def download_mandates_csv(self, filename: str, overwrite: bool = False, else: text = text.replace(f"{name},", f"{name},{policy},{cached},") try: - with open(filename, 'w') as f: + with open(filename, "w") as f: f.write(text) - except IOError: + except OSError: self.logger.error("Error writing mandates as %s", filename) finally: return text # TODO: Make it a public method in v1.6 - def _construct_url(self, baseurl: str, patents: bool = True, - citations: bool = True, year_low: int = None, - year_high: int = None, sort_by: str = "relevance", - include_last_year: str = "abstracts", - start_index: int = 0)-> str: + def _construct_url( + self, + baseurl: str, + patents: bool = True, + citations: bool = True, + year_low: Optional[int] = None, + year_high: Optional[int] = None, + sort_by: str = "relevance", + include_last_year: str = "abstracts", + start_index: int = 0, + ) -> str: """Construct URL from requested parameters.""" url = baseurl - yr_lo = '&as_ylo={0}'.format(year_low) if year_low is not None else '' - yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else '' - citations = '&as_vis={0}'.format(1 - int(citations)) - patents = '&as_sdt={0},33'.format(1 - int(patents)) - sortby = '' - start = '&start={0}'.format(start_index) if start_index > 0 else '' + yr_lo = f"&as_ylo={year_low}" if year_low is not None else "" + yr_hi = f"&as_yhi={year_high}" if year_high is not None else "" + citations = f"&as_vis={1 - int(citations)}" + patents = f"&as_sdt={1 - int(patents)},33" + sortby = "" + start = f"&start={start_index}" if start_index > 0 else "" if sort_by == "date": if include_last_year == "abstracts": - sortby = '&scisbd=1' + sortby = "&scisbd=1" elif include_last_year == "everything": - sortby = '&scisbd=2' + sortby = "&scisbd=2" else: - self.logger.debug("Invalid option for 'include_last_year', available options: 'everything', 'abstracts'") + self.logger.debug( + "Invalid option for 'include_last_year', available options: 'everything', 'abstracts'" + ) return elif sort_by != "relevance": - self.logger.debug("Invalid option for 'sort_by', available options: 'relevance', 'date'") + self.logger.debug( + "Invalid option for 'sort_by', available options: 'relevance', 'date'" + ) return # improve str below @@ -605,26 +683,30 @@ def get_journal_categories(self): soup = self.__nav._get_soup("/citations?view_op=top_venues&hl=en&vq=en") categories = {} for category in soup.find_all("a", class_="gs_md_li"): - if not "vq=" in category['href']: + if "vq=" not in category["href"]: continue - vq = category['href'].split("&vq=")[1] + vq = category["href"].split("&vq=")[1] categories[category.text] = {} categories[category.text][None] = vq for category in categories: vq = 
categories[category][None] - if vq=="en": + if vq == "en": continue soup = self.__nav._get_soup(f"/citations?view_op=top_venues&hl=en&vq={vq}") for subcategory in soup.find_all("a", class_="gs_md_li"): - if not f"&vq={vq}_" in subcategory['href']: + if f"&vq={vq}_" not in subcategory["href"]: continue - categories[category][subcategory.text] = subcategory['href'].split("&vq=")[1] + categories[category][subcategory.text] = subcategory["href"].split( + "&vq=" + )[1] - #print(categories) + # print(categories) return categories - def get_journals(self, category='English', subcategory=None, include_comments: bool = False) -> Dict[int, Journal]: + def get_journals( + self, category="English", subcategory=None, include_comments: bool = False + ) -> dict[int, Journal]: try: cat = self.journal_categories[category] try: @@ -637,46 +719,64 @@ def get_journals(self, category='English', subcategory=None, include_comments: b h5indices = soup.find_all("a", class_="gs_ibl gsc_mp_anchor") h5medians = soup.find_all("span", class_="gs_ibl") - - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() result = {} - for rank, name, h5index, h5median in zip(ranks, names, h5indices, h5medians): - url_citations = h5index['href'] + for rank, name, h5index, h5median in zip( + ranks, names, h5indices, h5medians + ): + url_citations = h5index["href"] comment = "" if include_comments: soup = self.__nav._get_soup(url_citations) try: - for cmt in soup.find_all('ul', class_='gsc_mlhd_list')[1].find_all('li'): - comment += cmt.text+"; " + for cmt in soup.find_all("ul", class_="gsc_mlhd_list")[ + 1 + ].find_all("li"): + comment += cmt.text + "; " except IndexError: pass - result[int(rank.text[:-1])] = Journal(name=name.text, - h5_index=int(h5index.text), - h5_median=int(h5median.text), - url_citations=url_citations, - comment=comment - ) - #print(result) + result[int(rank.text[:-1])] = Journal( + name=name.text, + h5_index=int(h5index.text), + h5_median=int(h5median.text), + url_citations=url_citations, + comment=comment, + ) + # print(result) return result except KeyError: - raise ValueError("Invalid subcategory: %s for %s. Choose one from %s" % (subcategory, category, cat.keys())) + raise ValueError( + f"Invalid subcategory: {subcategory} for {category}. Choose one from {cat.keys()}" + ) except KeyError: - raise ValueError("Invalid category: %s. Choose one from %s", category, self.journal_categories.keys()) - - def save_journals_csv(self, filename, category="English", subcategory=None, include_comments=False): + raise ValueError( + "Invalid category: %s. Choose one from %s", + category, + self.journal_categories.keys(), + ) + + def save_journals_csv( + self, filename, category="English", subcategory=None, include_comments=False + ): """ Save a list of journals to a file in CSV format. 
""" journals = self.get_journals(category, subcategory, include_comments) try: - with open(filename, 'w') as f: + with open(filename, "w") as f: csv_writer = csv.writer(f) - header = ['Publication', 'h5-index', 'h5-median'] + ['Comment']*include_comments + header = ["Publication", "h5-index", "h5-median"] + [ + "Comment" + ] * include_comments csv_writer.writerow(header) - for rank, journal in journals.items(): - row = [journal['name'], journal['h5_index'], journal['h5_median']] + [journal.get('comment', '')]*include_comments + for journal in journals.values(): + row = [ + journal["name"], + journal["h5_index"], + journal["h5_median"], + ] + [journal.get("comment", "")] * include_comments csv_writer.writerow(row) - except IOError: + except OSError: self.logger.error("Error writing journals as %s", filename) finally: return journals diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py index 4516b807..25c908f1 100644 --- a/scholarly/author_parser.py +++ b/scholarly/author_parser.py @@ -1,14 +1,16 @@ -from .publication_parser import PublicationParser -import re -from .data_types import Author, AuthorSource, PublicationSource, PublicAccess import codecs +import re +from typing import Optional + +from .data_types import Author, AuthorSource, PublicAccess, PublicationSource +from .publication_parser import PublicationParser -_CITATIONAUTHRE = r'user=([\w-]*)' -_HOST = 'https://scholar.google.com{0}' +_CITATIONAUTHRE = r"user=([\w-]*)" +_HOST = "https://scholar.google.com{0}" _PAGESIZE = 100 -_EMAILAUTHORRE = r'Verified email at ' -_CITATIONAUTH = '/citations?hl=en&user={0}' -_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}' +_EMAILAUTHORRE = r"Verified email at " +_CITATIONAUTH = "/citations?hl=en&user={0}" +_COAUTH = "/citations?view_op=list_colleagues&hl=en&user={0}" _MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}" @@ -17,181 +19,192 @@ class AuthorParser: def __init__(self, nav): self.nav = nav - self._sections = ['basics', - 'indices', - 'counts', - 'coauthors', - 'publications', - 'public_access'] - - def get_author(self, __data)->Author: - """ Fills the information for an author container - """ - author: Author = {'container_type': 'Author'} - author['filled'] = [] + self._sections = [ + "basics", + "indices", + "counts", + "coauthors", + "publications", + "public_access", + ] + + def get_author(self, __data) -> Author: + """Fills the information for an author container""" + author: Author = {"container_type": "Author"} + author["filled"] = [] if isinstance(__data, str): - author['scholar_id'] = __data - author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE + author["scholar_id"] = __data + author["source"] = AuthorSource.AUTHOR_PROFILE_PAGE else: - author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS - author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0] + author["source"] = AuthorSource.SEARCH_AUTHOR_SNIPPETS + author["scholar_id"] = re.findall(_CITATIONAUTHRE, __data("a")[0]["href"])[ + 0 + ] - pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id']) - author['url_picture'] = _HOST.format(pic) + pic = "/citations?view_op=medium_photo&user={}".format(author["scholar_id"]) + author["url_picture"] = _HOST.format(pic) - name_class = self._find_tag_class_name(__data, 'h3', 'name') - author['name'] = __data.find('h3', class_=name_class).text + name_class = self._find_tag_class_name(__data, "h3", "name") + author["name"] = __data.find("h3", class_=name_class).text - aff_class = 
self._find_tag_class_name(__data, 'div', 'aff') - affiliation = __data.find('div', class_=aff_class) + aff_class = self._find_tag_class_name(__data, "div", "aff") + affiliation = __data.find("div", class_=aff_class) if affiliation: - author['affiliation'] = affiliation.text + author["affiliation"] = affiliation.text - email_class = self._find_tag_class_name(__data, 'div', 'eml') - email = __data.find('div', class_=email_class) + email_class = self._find_tag_class_name(__data, "div", "eml") + email = __data.find("div", class_=email_class) if email: - author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text) + author["email_domain"] = re.sub(_EMAILAUTHORRE, r"@", email.text) - int_class = self._find_tag_class_name(__data, 'a', 'one_int') + int_class = self._find_tag_class_name(__data, "a", "one_int") if int_class: - interests = __data.find_all('a', class_=int_class) - author['interests'] = [i.text.strip() for i in interests] + interests = __data.find_all("a", class_=int_class) + author["interests"] = [i.text.strip() for i in interests] else: - author['interests'] = [] + author["interests"] = [] - citedby_class = self._find_tag_class_name(__data, 'div', 'cby') - citedby = __data.find('div', class_=citedby_class) - if citedby and citedby.text != '': - author['citedby'] = int(citedby.text[9:]) + citedby_class = self._find_tag_class_name(__data, "div", "cby") + citedby = __data.find("div", class_=citedby_class) + if citedby and citedby.text != "": + author["citedby"] = int(citedby.text[9:]) return author - def _find_tag_class_name(self, __data, tag, text): elements = __data.find_all(tag) for element in elements: - if 'class' in element.attrs and text in element.attrs['class'][0]: - return element.attrs['class'][0] + if "class" in element.attrs and text in element.attrs["class"][0]: + return element.attrs["class"][0] def _fill_basics(self, soup, author): - author['name'] = soup.find('div', id='gsc_prf_in').text - if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: - res = soup.find('img', id='gsc_prf_pup-img') - if res is not None: - if "avatar_scholar" not in res['src']: - author['url_picture'] = res['src'] - elif author['source'] == AuthorSource.CO_AUTHORS_LIST: - picture = soup.find('img', id="gsc_prf_pup-img").get('src') + author["name"] = soup.find("div", id="gsc_prf_in").text + if author["source"] == AuthorSource.AUTHOR_PROFILE_PAGE: + res = soup.find("img", id="gsc_prf_pup-img") + if res is not None and "avatar_scholar" not in res["src"]: + author["url_picture"] = res["src"] + elif author["source"] == AuthorSource.CO_AUTHORS_LIST: + picture = soup.find("img", id="gsc_prf_pup-img").get("src") if "avatar_scholar" in picture: picture = _HOST.format(picture) - author['url_picture'] = picture + author["url_picture"] = picture - affiliation = soup.find('div', class_='gsc_prf_il') - author['affiliation'] = affiliation.text - affiliation_link = affiliation.find('a') + affiliation = soup.find("div", class_="gsc_prf_il") + author["affiliation"] = affiliation.text + affiliation_link = affiliation.find("a") if affiliation_link: - author['organization'] = int(affiliation_link.get('href').split("org=")[-1]) - author['interests'] = [i.text.strip() for i in - soup.find_all('a', class_='gsc_prf_inta')] - email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il") - if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: + author["organization"] = int(affiliation_link.get("href").split("org=")[-1]) + author["interests"] = [ + i.text.strip() for i in soup.find_all("a", 
class_="gsc_prf_inta") + ] + email = soup.find("div", id="gsc_prf_ivh", class_="gsc_prf_il") + if author["source"] == AuthorSource.AUTHOR_PROFILE_PAGE: if email.text != "No verified email": - author['email_domain'] = '@'+email.text.split(" ")[3] - homepage = email.find('a', class_="gsc_prf_ila") + author["email_domain"] = "@" + email.text.split(" ")[3] + homepage = email.find("a", class_="gsc_prf_ila") if homepage: - author['homepage'] = homepage.get('href') + author["homepage"] = homepage.get("href") - index = soup.find_all('td', class_='gsc_rsb_std') + index = soup.find_all("td", class_="gsc_rsb_std") if index: - author['citedby'] = int(index[0].text) + author["citedby"] = int(index[0].text) def _fill_indices(self, soup, author): - index = soup.find_all('td', class_='gsc_rsb_std') + index = soup.find_all("td", class_="gsc_rsb_std") if index: - author['citedby'] = int(index[0].text) - author['citedby5y'] = int(index[1].text) - author['hindex'] = int(index[2].text) - author['hindex5y'] = int(index[3].text) - author['i10index'] = int(index[4].text) - author['i10index5y'] = int(index[5].text) + author["citedby"] = int(index[0].text) + author["citedby5y"] = int(index[1].text) + author["hindex"] = int(index[2].text) + author["hindex5y"] = int(index[3].text) + author["i10index"] = int(index[4].text) + author["i10index5y"] = int(index[5].text) else: - author['hindex'] = 0 - author['hindex5y'] = 0 - author['i10index'] = 0 - author['i10index5y'] = 0 + author["hindex"] = 0 + author["hindex5y"] = 0 + author["i10index"] = 0 + author["i10index5y"] = 0 def _fill_counts(self, soup, author): - years = [int(y.text) - for y in soup.find_all('span', class_='gsc_g_t')] + years = [int(y.text) for y in soup.find_all("span", class_="gsc_g_t")] - cites = [0]*len(years) - for c in soup.find_all('a', class_='gsc_g_a'): - i = int(c['style'].split(':')[-1]) - cites[-i] = int(c.find('span', class_='gsc_g_al').text) + cites = [0] * len(years) + for c in soup.find_all("a", class_="gsc_g_a"): + i = int(c["style"].split(":")[-1]) + cites[-i] = int(c.find("span", class_="gsc_g_al").text) - author['cites_per_year'] = dict(zip(years, cites)) + author["cites_per_year"] = dict(zip(years, cites)) def _fill_public_access(self, soup, author): - available = soup.find('div', class_='gsc_rsb_m_a') - not_available = soup.find('div', class_='gsc_rsb_m_na') + available = soup.find("div", class_="gsc_rsb_m_a") + not_available = soup.find("div", class_="gsc_rsb_m_na") n_available, n_not_available = 0, 0 if available: n_available = int(re.sub("[.,]", "", available.text.split(" ")[0])) if not_available: n_not_available = int(re.sub("[.,]", "", not_available.text.split(" ")[0])) - author["public_access"] = PublicAccess(available=n_available, - not_available=n_not_available) + author["public_access"] = PublicAccess( + available=n_available, not_available=n_not_available + ) - if 'publications' not in author['filled']: + if "publications" not in author["filled"]: return # Make a dictionary mapping to the publications - publications = {pub['author_pub_id']:pub for pub in author['publications']} - soup = self.nav._get_soup(_MANDATES.format(author['scholar_id'], _PAGESIZE)) + publications = {pub["author_pub_id"]: pub for pub in author["publications"]} + soup = self.nav._get_soup(_MANDATES.format(author["scholar_id"], _PAGESIZE)) while True: - rows = soup.find_all('div', 'gsc_mnd_sec_na') + rows = soup.find_all("div", "gsc_mnd_sec_na") if rows: - for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gsc_mnd_link_font'): - author_pub_id = 
re.findall(r"citation_for_view=([\w:-]*)", - row['data-href'])[0] + for row in rows[0].find_all("a", "gsc_mnd_art_rvw gsc_mnd_link_font"): + author_pub_id = re.findall( + r"citation_for_view=([\w:-]*)", row["data-href"] + )[0] publications[author_pub_id]["public_access"] = False - rows = soup.find_all('div', 'gsc_mnd_sec_avl') + rows = soup.find_all("div", "gsc_mnd_sec_avl") if rows: - for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gsc_mnd_link_font'): - author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", - row['data-href'])[0] + for row in rows[0].find_all("a", "gsc_mnd_art_rvw gsc_mnd_link_font"): + author_pub_id = re.findall( + r"citation_for_view=([\w:-]*)", row["data-href"] + )[0] publications[author_pub_id]["public_access"] = True next_button = soup.find(class_="gs_btnPR") if next_button and "disabled" not in next_button.attrs: - url = next_button['onclick'][17:-1] + url = next_button["onclick"][17:-1] url = codecs.getdecoder("unicode_escape")(url)[0] soup = self.nav._get_soup(url) else: break - - def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''): - author['publications'] = list() + def _fill_publications( + self, soup, author, publication_limit: int = 0, sortby_str: str = "" + ): + author["publications"] = list() pubstart = 0 - url_citations = _CITATIONAUTH.format(author['scholar_id']) + url_citations = _CITATIONAUTH.format(author["scholar_id"]) url_citations += sortby_str pub_parser = PublicationParser(self.nav) flag = False while True: - for row in soup.find_all('tr', class_='gsc_a_tr'): - new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY) - author['publications'].append(new_pub) - if (publication_limit) and (len(author['publications']) >= publication_limit): + for row in soup.find_all("tr", class_="gsc_a_tr"): + new_pub = pub_parser.get_publication( + row, PublicationSource.AUTHOR_PUBLICATION_ENTRY + ) + author["publications"].append(new_pub) + if (publication_limit) and ( + len(author["publications"]) >= publication_limit + ): flag = True break - if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs and not flag: + if ( + "disabled" not in soup.find("button", id="gsc_bpf_more").attrs + and not flag + ): pubstart += _PAGESIZE - url = '{0}&cstart={1}&pagesize={2}'.format( - url_citations, pubstart, _PAGESIZE) + url = f"{url_citations}&cstart={pubstart}&pagesize={_PAGESIZE}" soup = self.nav._get_soup(url) else: break @@ -207,15 +220,16 @@ def _get_coauthors_short(self, soup): ----- This method is to be called by _fill_coauthors method. """ - coauthors = soup.find_all('span', class_='gsc_rsb_a_desc') - coauthor_ids = [re.findall(_CITATIONAUTHRE, - coauth('a')[0].get('href'))[0] - for coauth in coauthors] + coauthors = soup.find_all("span", class_="gsc_rsb_a_desc") + coauthor_ids = [ + re.findall(_CITATIONAUTHRE, coauth("a")[0].get("href"))[0] + for coauth in coauthors + ] - coauthor_names = [coauth.find(tabindex="-1").text - for coauth in coauthors] - coauthor_affils = [coauth.find(class_="gsc_rsb_a_ext").text - for coauth in coauthors] + coauthor_names = [coauth.find(tabindex="-1").text for coauth in coauthors] + coauthor_affils = [ + coauth.find(class_="gsc_rsb_a_ext").text for coauth in coauthors + ] return coauthor_ids, coauthor_names, coauthor_affils @@ -229,25 +243,25 @@ def _get_coauthors_long(self, author): ----- This method is to be called by _fill_coauthors method. 
""" - soup = self.nav._get_soup(_COAUTH.format(author['scholar_id'])) - coauthors = soup.find_all('div', 'gs_ai gs_scl') - coauthor_ids = [re.findall(_CITATIONAUTHRE, - coauth('a')[0].get('href'))[0] - for coauth in coauthors] + soup = self.nav._get_soup(_COAUTH.format(author["scholar_id"])) + coauthors = soup.find_all("div", "gs_ai gs_scl") + coauthor_ids = [ + re.findall(_CITATIONAUTHRE, coauth("a")[0].get("href"))[0] + for coauth in coauthors + ] coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors] - coauthor_affils = [coauth.find(class_="gs_ai_aff").text - for coauth in coauthors] + coauthor_affils = [coauth.find(class_="gs_ai_aff").text for coauth in coauthors] return coauthor_ids, coauthor_names, coauthor_affils def _fill_coauthors(self, soup, author): # If "View All" is not found, scrape the page for coauthors - if not soup.find_all('button', id='gsc_coauth_opn'): + if not soup.find_all("button", id="gsc_coauth_opn"): coauthor_info = self._get_coauthors_short(soup) else: - # If "View All" is found, try opening the dialog box. - # If geckodriver is not installed, resort to a short list and warn. + # If "View All" is found, try opening the dialog box. + # If geckodriver is not installed, resort to a short list and warn. try: coauthor_info = self._get_coauthors_long(author) except Exception as err: @@ -255,15 +269,21 @@ def _fill_coauthors(self, soup, author): self.nav.logger.warning(err) self.nav.logger.warning("Fetching only the top 20 coauthors") - author['coauthors'] = [] + author["coauthors"] = [] for coauth_id, coauth_name, coauth_affil in zip(*coauthor_info): new_coauthor = self.get_author(coauth_id) - new_coauthor['name'] = coauth_name - new_coauthor['affiliation'] = coauth_affil - new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST - author['coauthors'].append(new_coauthor) - - def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0): + new_coauthor["name"] = coauth_name + new_coauthor["affiliation"] = coauth_affil + new_coauthor["source"] = AuthorSource.CO_AUTHORS_LIST + author["coauthors"].append(new_coauthor) + + def fill( + self, + author, + sections: Optional[list] = None, + sortby="citedby", + publication_limit: int = 0, + ): """Populate the Author with information from their profile The `sections` argument allows for finer granularity of the profile @@ -427,42 +447,65 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit: 'source': 'SEARCH_AUTHOR_SNIPPETS', 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ'} """ + if sections is None: + sections = [] try: sections = [section.lower() for section in sections] - sections.sort(reverse=True) # Ensure 'publications' comes before 'public_access' - sortby_str = '' + sections.sort( + reverse=True + ) # Ensure 'publications' comes before 'public_access' + sortby_str = "" if sortby == "year": - sortby_str = '&view_op=list_works&sortby=pubdate' + sortby_str = "&view_op=list_works&sortby=pubdate" elif sortby != "citedby": - raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'") - url_citations = _CITATIONAUTH.format(author['scholar_id']) + raise Exception( + "Please enter a valid sortby parameter. 
Options: 'year', 'citedby'" + ) + url_citations = _CITATIONAUTH.format(author["scholar_id"]) url_citations += sortby_str - url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE) + url = f"{url_citations}&pagesize={_PAGESIZE}" soup = self.nav._get_soup(url) # Update scholar_id - scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0] - if scholar_id != author['scholar_id']: - self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. " - "To avoid this warning, use %s to look up this scholar.", - author['scholar_id'], scholar_id, scholar_id) + scholar_id = re.findall( + _CITATIONAUTHRE, soup.find("link", rel="canonical").get("href", "") + )[0] + if scholar_id != author["scholar_id"]: + self.nav.logger.warning( + "Changing the scholar_id following redirect from %s to %s. " + "To avoid this warning, use %s to look up this scholar.", + author["scholar_id"], + scholar_id, + scholar_id, + ) author["scholar_id"] = scholar_id if sections == []: for i in self._sections: - if i not in author['filled']: - (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) - author['filled'].append(i) + if i not in author["filled"]: + ( + getattr(self, f"_fill_{i}")(soup, author) + if i != "publications" + else getattr(self, f"_fill_{i}")( + soup, author, publication_limit, sortby_str + ) + ) + author["filled"].append(i) else: for i in sections: - if i in self._sections and i not in author['filled']: - (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) - author['filled'].append(i) + if i in self._sections and i not in author["filled"]: + ( + getattr(self, f"_fill_{i}")(soup, author) + if i != "publications" + else getattr(self, f"_fill_{i}")( + soup, author, publication_limit, sortby_str + ) + ) + author["filled"].append(i) except Exception as e: - raise(e) + raise (e) return author - def __repr__(self): return self.__str__() diff --git a/scholarly/data_types.py b/scholarly/data_types.py index 13a9a38a..bfb5d318 100644 --- a/scholarly/data_types.py +++ b/scholarly/data_types.py @@ -1,16 +1,9 @@ -import sys - from enum import Enum -from typing import List, Dict - -if sys.version_info >= (3, 8): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from typing import TypedDict class PublicationSource(str, Enum): - ''' + """ Defines the source of the publication. In general, a publication on Google Scholar has two forms: * Appearing as a PUBLICATION SNIPPET and @@ -58,14 +51,15 @@ class PublicationSource(str, Enum): To fill in the publication, we open the "detailed view" of the paper Detailed view page: https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-Km63D4AAAAJ:d1gkVwhDpl0C - ''' + """ + PUBLICATION_SEARCH_SNIPPET = "PUBLICATION_SEARCH_SNIPPET" AUTHOR_PUBLICATION_ENTRY = "AUTHOR_PUBLICATION_ENTRY" JOURNAL_CITATION_LIST = "JOURNAL_CITATION_LIST" class AuthorSource(str, Enum): - ''' + """ Defines the source of the HTML that will be parsed. 
Author page: https://scholar.google.com/citations?hl=en&user=yxUduqMAAAAJ @@ -73,7 +67,8 @@ class AuthorSource(str, Enum): Search authors: https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=jordan&btnG= Coauthors: From the list of co-authors from an Author page - ''' + """ + AUTHOR_PROFILE_PAGE = "AUTHOR_PROFILE_PAGE" SEARCH_AUTHOR_SNIPPETS = "SEARCH_AUTHOR_SNIPPETS" CO_AUTHORS_LIST = "CO_AUTHORS_LIST" @@ -83,6 +78,7 @@ class ProxyMode(str, Enum): """ Defines the different types supported. """ + FREE_PROXIES = "FREE_PROXIES" SCRAPERAPI = "SCRAPERAPI" LUMINATI = "LUMINATI" @@ -92,14 +88,18 @@ class ProxyMode(str, Enum): TOR_INTERNAL = "TOR_INTERNAL" -''' Lightweight Data Structure to keep distribution of citations of the years ''' -CitesPerYear = Dict[int, int] +""" Lightweight Data Structure to keep distribution of citations of the years """ +CitesPerYear = dict[int, int] -''' Lightweight Data Structure to hold the numbers articles available or +""" Lightweight Data Structure to hold the numbers articles available or not available publicly according to funding mandates -''' -PublicAccess = TypedDict('PublicAccess', {"available": int, "not_available": int}) +""" + + +class PublicAccess(TypedDict): + available: int + not_available: int class BibEntry(TypedDict, total=False): @@ -122,6 +122,7 @@ class BibEntry(TypedDict, total=False): :param citation: Formatted citation string, usually containing journal name, volume and page numbers (source: AUTHOR_PUBLICATION_ENTRY) :param pub_url: url of the website providing the publication """ + pub_type: str bib_id: str abstract: str @@ -149,6 +150,7 @@ class Mandate(TypedDict, total=False): :param acknowledgement: text in the paper acknowledging the funding :param grant: grant ID that supported this work """ + agency: str url_policy: str url_policy_cached: str @@ -207,14 +209,14 @@ class Publication(TypedDict, total=False): bib: BibEntry gsrank: int - author_id: List[str] + author_id: list[str] num_citations: int - cites_id: List[str] + cites_id: list[str] citedby_url: str cites_per_year: CitesPerYear author_pub_id: str public_access: bool - mandates: List[Mandate] + mandates: list[Mandate] eprint_url: str pub_url: str url_add_sclib: str @@ -224,6 +226,7 @@ class Publication(TypedDict, total=False): source: PublicationSource container_type: str + class Author(TypedDict, total=False): """ :class:`Author ` object used to represent an author entry on Google Scholar. @@ -261,8 +264,8 @@ class Author(TypedDict, total=False): url_picture: str homepage: str citedby: int - filled: List[str] - interests: List[str] + filled: list[str] + interests: list[str] citedby5y: int hindex: int hindex5y: int @@ -270,11 +273,12 @@ class Author(TypedDict, total=False): i10index5y: int cites_per_year: CitesPerYear public_access: PublicAccess - publications: List[Publication] - coauthors: List # List of authors. No self dict functionality available + publications: list[Publication] + coauthors: list # List of authors. No self dict functionality available container_type: str source: AuthorSource + class Journal(TypedDict, total=False): """ :class:`Journal ` object used to represent a journal entry on Google Scholar. 
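
Aside (not part of the patch itself): a minimal sketch of how the reworked data_types declarations behave. The PublicAccess and CitesPerYear definitions below mirror the diff above, redeclared so the sketch runs standalone; the sample values (10, 2, the years) are invented for illustration. Because a class-based TypedDict is still a plain dict at runtime, existing call sites that build these structures keyword-by-keyword or from dict literals should keep working unchanged.

    from typing import TypedDict

    # Mirrors scholarly/data_types.py after this patch; requires Python 3.9+
    # (the patch sets target-version = "py39"), since dict[int, int] is used
    # as a builtin generic alias.
    CitesPerYear = dict[int, int]


    class PublicAccess(TypedDict):
        available: int
        not_available: int


    # A class-based TypedDict can be constructed from keyword arguments or a
    # dict literal; both produce an ordinary dict.
    access: PublicAccess = PublicAccess(available=10, not_available=2)
    per_year: CitesPerYear = {2022: 5, 2023: 8}

    assert access == {"available": 10, "not_available": 2}
    assert sum(per_year.values()) == 13
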
diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index 5d7e7287..7b0e42c8 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -1,54 +1,65 @@ import re -import bibtexparser +from typing import Optional + import arrow +import bibtexparser from bibtexparser.bibdatabase import BibDatabase -from .data_types import BibEntry, Mandate, Publication, PublicationSource +from .data_types import BibEntry, Mandate, Publication, PublicationSource -_SCHOLARPUBRE = r'cites=([\d,]*)' -_CITATIONPUB = '/citations?hl=en&view_op=view_citation&citation_for_view={0}' -_SCHOLARPUB = '/scholar?hl=en&oi=bibs&cites={0}' -_CITATIONPUBRE = r'citation_for_view=([\w-]*:[\w-]*)' -_BIBCITE = '/scholar?hl=en&q=info:{0}:scholar.google.com/\ -&output=cite&scirp={1}&hl=en' -_CITEDBYLINK = '/scholar?hl=en&cites={0}' -_MANDATES_URL = '/citations?view_op=view_mandate&hl=en&citation_for_view={0}' +_SCHOLARPUBRE = r"cites=([\d,]*)" +_CITATIONPUB = "/citations?hl=en&view_op=view_citation&citation_for_view={0}" +_SCHOLARPUB = "/scholar?hl=en&oi=bibs&cites={0}" +_CITATIONPUBRE = r"citation_for_view=([\w-]*:[\w-]*)" +_BIBCITE = "/scholar?hl=en&q=info:{0}:scholar.google.com/\ +&output=cite&scirp={1}&hl=en" +_CITEDBYLINK = "/scholar?hl=en&cites={0}" +_MANDATES_URL = "/citations?view_op=view_mandate&hl=en&citation_for_view={0}" _BIB_MAPPING = { - 'ENTRYTYPE': 'pub_type', - 'ID': 'bib_id', - 'year': 'pub_year', + "ENTRYTYPE": "pub_type", + "ID": "bib_id", + "year": "pub_year", } _BIB_DATATYPES = { - 'number': 'str', - 'volume': 'str', + "number": "str", + "volume": "str", } _BIB_REVERSE_MAPPING = { - 'pub_type': 'ENTRYTYPE', - 'bib_id': 'ID', + "pub_type": "ENTRYTYPE", + "bib_id": "ID", } -def remap_bib(parsed_bib: dict, mapping: dict, data_types:dict ={}) -> BibEntry: + +def remap_bib( + parsed_bib: dict, mapping: dict, data_types: Optional[dict] = None +) -> BibEntry: + if data_types is None: + data_types = {} for key, value in mapping.items(): if key in parsed_bib: parsed_bib[value] = parsed_bib.pop(key) for key, value in data_types.items(): - if key in parsed_bib: - if value == 'int': - parsed_bib[key] = int(parsed_bib[key]) + if key in parsed_bib and value == "int": + parsed_bib[key] = int(parsed_bib[key]) return parsed_bib -class _SearchScholarIterator(object): + +class _SearchScholarIterator: """Iterator that returns Publication objects from the search page I have removed all logging from here for simplicity. -V """ def __init__(self, nav, url: str): self._url = url - self._pubtype = PublicationSource.PUBLICATION_SEARCH_SNIPPET if "/scholar?" in url else PublicationSource.JOURNAL_CITATION_LIST + self._pubtype = ( + PublicationSource.PUBLICATION_SEARCH_SNIPPET + if "/scholar?" 
in url + else PublicationSource.JOURNAL_CITATION_LIST + ) self._nav = nav self._load_url(url) self.total_results = self._get_total_results() @@ -58,18 +69,22 @@ def _load_url(self, url: str): # this is temporary until setup json file self._soup = self._nav._get_soup(url) self._pos = 0 - self._rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl') + self._soup.find_all('div', class_='gs_r gs_or gs_scl gs_fmar') + self._soup.find_all('div', class_='gsc_mpat_ttl') + self._rows = ( + self._soup.find_all("div", class_="gs_r gs_or gs_scl") + + self._soup.find_all("div", class_="gs_r gs_or gs_scl gs_fmar") + + self._soup.find_all("div", class_="gsc_mpat_ttl") + ) def _get_total_results(self): if self._soup.find("div", class_="gs_pda"): return None - for x in self._soup.find_all('div', class_='gs_ab_mdw'): + for x in self._soup.find_all("div", class_="gs_ab_mdw"): # Accounting for different thousands separators: # comma, dot, space, apostrophe - match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.\s’]+)', string=x.text) + match = re.match(pattern=r"(^|\s*About)\s*([0-9,\.\s’]+)", string=x.text) if match: - return int(re.sub(pattern=r'[,\.\s’]',repl='', string=match.group(2))) + return int(re.sub(pattern=r"[,\.\s’]", repl="", string=match.group(2))) return len(self._rows) # Iterator protocol @@ -83,9 +98,8 @@ def __next__(self): self._pos += 1 res = self.pub_parser.get_publication(row, self._pubtype) return res - elif self._soup.find(class_='gs_ico gs_ico_nav_next'): - url = self._soup.find( - class_='gs_ico gs_ico_nav_next').parent['href'] + elif self._soup.find(class_="gs_ico gs_ico_nav_next"): + url = self._soup.find(class_="gs_ico gs_ico_nav_next").parent["href"] self._url = url self._load_url(url) return self.__next__() @@ -94,15 +108,15 @@ def __next__(self): # Pickle protocol def __getstate__(self): - return {'url': self._url, 'pos': self._pos} + return {"url": self._url, "pos": self._pos} def __setstate__(self, state): # this needs validation -V - self._load_url(state['url']) - self._pos = state['pos'] + self._load_url(state["url"]) + self._pos = state["pos"] -class PublicationParser(object): +class PublicationParser: """Returns an object for a single publication""" def __init__(self, nav): @@ -110,46 +124,46 @@ def __init__(self, nav): def _citation_pub(self, __data, publication: Publication): # create the bib entry in the dictionary - publication['bib']['title'] = __data.find('a', class_='gsc_a_at').text - publication['author_pub_id'] = re.findall(_CITATIONPUBRE, __data.find( - 'a', class_='gsc_a_at')['href'])[0] - citedby = __data.find(class_='gsc_a_ac') + publication["bib"]["title"] = __data.find("a", class_="gsc_a_at").text + publication["author_pub_id"] = re.findall( + _CITATIONPUBRE, __data.find("a", class_="gsc_a_at")["href"] + )[0] + citedby = __data.find(class_="gsc_a_ac") publication["num_citations"] = 0 - if citedby and not (citedby.text.isspace() or citedby.text == ''): + if citedby and not (citedby.text.isspace() or citedby.text == ""): publication["num_citations"] = int(citedby.text.strip()) publication["citedby_url"] = citedby["href"] - publication["cites_id"] = re.findall(_SCHOLARPUBRE, citedby["href"])[0].split(',') + publication["cites_id"] = re.findall(_SCHOLARPUBRE, citedby["href"])[ + 0 + ].split(",") - year = __data.find(class_='gsc_a_h') - if (year and year.text - and not year.text.isspace() - and len(year.text) > 0): - publication['bib']['pub_year'] = year.text.strip() + year = __data.find(class_="gsc_a_h") + if year and year.text and not year.text.isspace() 
and len(year.text) > 0: + publication["bib"]["pub_year"] = year.text.strip() - author_citation = __data.find_all('div', class_='gs_gray') + author_citation = __data.find_all("div", class_="gs_gray") try: citation = author_citation[1].text except IndexError: citation = "" - publication['bib']['citation'] = citation + publication["bib"]["citation"] = citation return publication - def get_publication(self, __data, pubtype: PublicationSource)->Publication: - """Returns a publication that has either 'citation' or 'scholar' source - """ + def get_publication(self, __data, pubtype: PublicationSource) -> Publication: + """Returns a publication that has either 'citation' or 'scholar' source""" - publication: Publication = {'container_type': 'Publication'} - publication['source'] = pubtype - publication['bib'] = {} - publication['filled'] = False + publication: Publication = {"container_type": "Publication"} + publication["source"] = pubtype + publication["bib"] = {} + publication["filled"] = False - if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: + if publication["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: return self._citation_pub(__data, publication) - elif publication['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: + elif publication["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: return self._scholar_pub(__data, publication) - elif publication['source'] == PublicationSource.JOURNAL_CITATION_LIST: + elif publication["source"] == PublicationSource.JOURNAL_CITATION_LIST: return publication # TODO: self._journal_pub(__data, publication) else: @@ -157,26 +171,32 @@ def get_publication(self, __data, pubtype: PublicationSource)->Publication: def _get_authorlist(self, authorinfo): authorlist = list() - text = authorinfo.split(' - ')[0] - for i in text.split(','): + text = authorinfo.split(" - ")[0] + for i in text.split(","): i = i.strip() - if bool(re.search(r'\d', i)): + if bool(re.search(r"\d", i)): continue - if ("Proceedings" in i or "Conference" in i or "Journal" in i or - "(" in i or ")" in i or "[" in i or "]" in i or - "Transactions" in i): + if ( + "Proceedings" in i + or "Conference" in i + or "Journal" in i + or "(" in i + or ")" in i + or "[" in i + or "]" in i + or "Transactions" in i + ): continue i = i.replace("…", "") authorlist.append(i) return authorlist - def _get_author_id_list(self, authorinfo_inner_html): author_id_list = list() - html = authorinfo_inner_html.split(' - ')[0] - for author_html in html.split(','): + html = authorinfo_inner_html.split(" - ")[0] + for author_html in html.split(","): author_html = author_html.strip() - match = re.search('\\?user=(.*?)&', author_html) + match = re.search("\\?user=(.*?)&", author_html) if match: author_id_list.append(match.groups()[0]) else: @@ -184,29 +204,29 @@ def _get_author_id_list(self, authorinfo_inner_html): return author_id_list def _scholar_pub(self, __data, publication: Publication): - databox = __data.find('div', class_='gs_ri') - title = databox.find('h3', class_='gs_rt') + databox = __data.find("div", class_="gs_ri") + title = databox.find("h3", class_="gs_rt") - cid = __data.get('data-cid') - pos = __data.get('data-rp') + cid = __data.get("data-cid") + pos = __data.get("data-rp") - publication['gsrank'] = int(pos) + 1 + publication["gsrank"] = int(pos) + 1 - if title.find('span', class_='gs_ctu'): # A citation + if title.find("span", class_="gs_ctu"): # A citation title.span.extract() - elif title.find('span', class_='gs_ctc'): # A book or PDF + elif 
title.find("span", class_="gs_ctc"): # A book or PDF title.span.extract() - publication['bib']['title'] = title.text.strip() + publication["bib"]["title"] = title.text.strip() - if title.find('a'): - publication['pub_url'] = title.find('a')['href'] + if title.find("a"): + publication["pub_url"] = title.find("a")["href"] - author_div_element = databox.find('div', class_='gs_a') + author_div_element = databox.find("div", class_="gs_a") authorinfo = author_div_element.text - authorinfo = authorinfo.replace(u'\xa0', u' ') # NBSP - authorinfo = authorinfo.replace(u'&', u'&') # Ampersand - publication['bib']["author"] = self._get_authorlist(authorinfo) + authorinfo = authorinfo.replace("\xa0", " ") # NBSP + authorinfo = authorinfo.replace("&", "&") # Ampersand + publication["bib"]["author"] = self._get_authorlist(authorinfo) authorinfo_html = author_div_element.decode_contents() publication["author_id"] = self._get_author_id_list(authorinfo_html) @@ -219,162 +239,184 @@ def _scholar_pub(self, __data, publication: Publication): # the middle venue/year part. In principle the venue is separated # from the year by a comma. However, there exist venues with commas # and as shown above there might not always be a venue AND a year... - venueyear = authorinfo.split(' - ') + venueyear = authorinfo.split(" - ") # If there is no middle part (A) then venue and year are unknown. if len(venueyear) <= 2: - publication['bib']['venue'], publication['bib']['pub_year'] = 'NA', 'NA' + publication["bib"]["venue"], publication["bib"]["pub_year"] = "NA", "NA" else: - venueyear = venueyear[1].split(',') - venue = 'NA' + venueyear = venueyear[1].split(",") + venue = "NA" year = venueyear[-1].strip() if year.isnumeric() and len(year) == 4: - publication['bib']['pub_year'] = year + publication["bib"]["pub_year"] = year if len(venueyear) >= 2: - venue = ','.join(venueyear[0:-1]) # everything but last + venue = ",".join(venueyear[0:-1]) # everything but last else: - venue = ','.join(venueyear) # everything - publication['bib']['pub_year'] = 'NA' - publication['bib']['venue'] = venue - - if databox.find('div', class_='gs_rs'): - publication['bib']['abstract'] = databox.find('div', class_='gs_rs').text - publication['bib']['abstract'] = publication['bib']['abstract'].replace(u'\u2026', u'') - publication['bib']['abstract'] = publication['bib']['abstract'].replace(u'\n', u' ') - publication['bib']['abstract'] = publication['bib']['abstract'].strip() - - if publication['bib']['abstract'][0:8].lower() == 'abstract': - publication['bib']['abstract'] = publication['bib']['abstract'][9:].strip() - - publication['url_scholarbib'] = _BIBCITE.format(cid, pos) + venue = ",".join(venueyear) # everything + publication["bib"]["pub_year"] = "NA" + publication["bib"]["venue"] = venue + + if databox.find("div", class_="gs_rs"): + publication["bib"]["abstract"] = databox.find("div", class_="gs_rs").text + publication["bib"]["abstract"] = publication["bib"]["abstract"].replace( + "\u2026", "" + ) + publication["bib"]["abstract"] = publication["bib"]["abstract"].replace( + "\n", " " + ) + publication["bib"]["abstract"] = publication["bib"]["abstract"].strip() + + if publication["bib"]["abstract"][0:8].lower() == "abstract": + publication["bib"]["abstract"] = publication["bib"]["abstract"][ + 9: + ].strip() + + publication["url_scholarbib"] = _BIBCITE.format(cid, pos) sclib = self.nav.publib.format(id=cid) - publication['url_add_sclib'] = sclib + publication["url_add_sclib"] = sclib - lowerlinks = databox.find('div', class_='gs_fl').find_all('a') 
+ lowerlinks = databox.find("div", class_="gs_fl").find_all("a") publication["num_citations"] = 0 for link in lowerlinks: - if 'Cited by' in link.text: - publication['num_citations'] = int(re.findall(r'\d+', link.text)[0].strip()) - publication['citedby_url'] = link['href'] - - if 'Related articles' in link.text: - publication['url_related_articles'] = link['href'] - - if __data.find('div', class_='gs_ggs gs_fl'): - publication['eprint_url'] = __data.find( - 'div', class_='gs_ggs gs_fl').a['href'] + if "Cited by" in link.text: + publication["num_citations"] = int( + re.findall(r"\d+", link.text)[0].strip() + ) + publication["citedby_url"] = link["href"] + + if "Related articles" in link.text: + publication["url_related_articles"] = link["href"] + + if __data.find("div", class_="gs_ggs gs_fl"): + publication["eprint_url"] = __data.find("div", class_="gs_ggs gs_fl").a[ + "href" + ] return publication - - def fill(self, publication: Publication)->Publication: + def fill(self, publication: Publication) -> Publication: """Populate the Publication with information from its profile :param publication: Scholar or Citation publication container object that is not filled :type publication: PublicationCitation or PublicationScholar """ - if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: - url = _CITATIONPUB.format(publication['author_pub_id']) + if publication["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: + url = _CITATIONPUB.format(publication["author_pub_id"]) soup = self.nav._get_soup(url) - publication['bib']['title'] = soup.find('div', id='gsc_oci_title').text - if publication['bib']['title'][-1] == '\u2026': - merged_snippet = soup.find('div', class_='gsc_oci_merged_snippet') + publication["bib"]["title"] = soup.find("div", id="gsc_oci_title").text + if publication["bib"]["title"][-1] == "\u2026": + merged_snippet = soup.find("div", class_="gsc_oci_merged_snippet") if merged_snippet: - title_div = merged_snippet.find('div') + title_div = merged_snippet.find("div") if title_div: - publication['bib']['title'] = title_div.text - if soup.find('a', class_='gsc_oci_title_link'): - publication['pub_url'] = soup.find( - 'a', class_='gsc_oci_title_link')['href'] - for item in soup.find_all('div', class_='gs_scl'): - key = item.find(class_='gsc_oci_field').text.strip().lower() - val = item.find(class_='gsc_oci_value') - if key == 'authors' or key == 'inventors': - publication['bib']['author'] = ' and '.join( - [i.strip() for i in val.text.split(',')]) - elif key == 'journal': - publication['bib']['journal'] = val.text - elif key == 'conference': - publication['bib']['conference'] = val.text - elif key == 'volume': - publication['bib']['volume'] = val.text - elif key == 'issue': - publication['bib']['number'] = val.text - elif key == 'pages': - publication['bib']['pages'] = val.text - elif key == 'publisher': - publication['bib']['publisher'] = val.text - elif key == 'publication date': - - patterns = ['YYYY/M', - 'YYYY/MM/DD', - 'YYYY', - 'YYYY/M/DD', - 'YYYY/M/D', - 'YYYY/MM/D'] - publication['bib']['pub_year'] = arrow.get(val.text, patterns).year - publication['bib']['pub_date'] = val.text - elif key == 'description': + publication["bib"]["title"] = title_div.text + if soup.find("a", class_="gsc_oci_title_link"): + publication["pub_url"] = soup.find("a", class_="gsc_oci_title_link")[ + "href" + ] + for item in soup.find_all("div", class_="gs_scl"): + key = item.find(class_="gsc_oci_field").text.strip().lower() + val = item.find(class_="gsc_oci_value") + if key == 
"authors" or key == "inventors": + publication["bib"]["author"] = " and ".join( + [i.strip() for i in val.text.split(",")] + ) + elif key == "journal": + publication["bib"]["journal"] = val.text + elif key == "conference": + publication["bib"]["conference"] = val.text + elif key == "volume": + publication["bib"]["volume"] = val.text + elif key == "issue": + publication["bib"]["number"] = val.text + elif key == "pages": + publication["bib"]["pages"] = val.text + elif key == "publisher": + publication["bib"]["publisher"] = val.text + elif key == "publication date": + patterns = [ + "YYYY/M", + "YYYY/MM/DD", + "YYYY", + "YYYY/M/DD", + "YYYY/M/D", + "YYYY/MM/D", + ] + publication["bib"]["pub_year"] = arrow.get(val.text, patterns).year + publication["bib"]["pub_date"] = val.text + elif key == "description": # try to find all the gsh_csp if they exist - abstract = val.find_all(class_='gsh_csp') + abstract = val.find_all(class_="gsh_csp") result = "" # append all gsh_csp together as there can be multiple in certain scenarios for item in abstract: - if item.text[0:8].lower() == 'abstract': + if item.text[0:8].lower() == "abstract": result += item.text[9:].strip() else: result += item.text if len(abstract) == 0: # if no gsh_csp were found - abstract = val.find(class_='gsh_small') + abstract = val.find(class_="gsh_small") if abstract: - if abstract.text[0:8].lower() == 'abstract': + if abstract.text[0:8].lower() == "abstract": result = abstract.text[9:].strip() else: result = abstract.text else: - result = ' '.join([description_part for description_part in val]) - - publication['bib']['abstract'] = result - elif key == 'total citations': - publication['cites_id'] = re.findall( - _SCHOLARPUBRE, val.a['href'])[0].split(',') - publication['citedby_url'] = _CITEDBYLINK.format(','.join(publication['cites_id'])) - elif key == 'scholar articles': - for entry in val.find_all('a'): - if entry.text.lower() == 'related articles': - publication['url_related_articles'] = entry.get('href')[26:] + result = " ".join( + [description_part for description_part in val] + ) + + publication["bib"]["abstract"] = result + elif key == "total citations": + publication["cites_id"] = re.findall(_SCHOLARPUBRE, val.a["href"])[ + 0 + ].split(",") + publication["citedby_url"] = _CITEDBYLINK.format( + ",".join(publication["cites_id"]) + ) + elif key == "scholar articles": + for entry in val.find_all("a"): + if entry.text.lower() == "related articles": + publication["url_related_articles"] = entry.get("href")[26:] break # number of citation per year - years = [int(y.text) for y in soup.find_all(class_='gsc_oci_g_t')] - cites = [int(c.text) for c in soup.find_all(class_='gsc_oci_g_al')] - cites_year = [int(c.get('href')[-4:]) for c in soup.find_all(class_='gsc_oci_g_a')] + years = [int(y.text) for y in soup.find_all(class_="gsc_oci_g_t")] + cites = [int(c.text) for c in soup.find_all(class_="gsc_oci_g_al")] + cites_year = [ + int(c.get("href")[-4:]) for c in soup.find_all(class_="gsc_oci_g_a") + ] nonzero_cites_per_year = dict(zip(cites_year, cites)) res_dict = {} for year in years: - res_dict[year] = (nonzero_cites_per_year[year] if year in nonzero_cites_per_year else 0) - publication['cites_per_year'] = res_dict + res_dict[year] = nonzero_cites_per_year.get(year, 0) + publication["cites_per_year"] = res_dict - if soup.find('div', class_='gsc_vcd_title_ggi'): - publication['eprint_url'] = soup.find( - 'div', class_='gsc_vcd_title_ggi').a['href'] + if soup.find("div", class_="gsc_vcd_title_ggi"): + publication["eprint_url"] = 
soup.find( + "div", class_="gsc_vcd_title_ggi" + ).a["href"] - if publication.get('public_access', None): - publication['mandates'] = [] + if publication.get("public_access", None): + publication["mandates"] = [] self._fill_public_access_mandates(publication) - publication['filled'] = True - elif publication['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: - bibtex_url = self._get_bibtex(publication['url_scholarbib']) + publication["filled"] = True + elif publication["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: + bibtex_url = self._get_bibtex(publication["url_scholarbib"]) bibtex = self.nav._get_page(bibtex_url) parser = bibtexparser.bparser.BibTexParser(common_strings=True) - parsed_bib = remap_bib(bibtexparser.loads(bibtex,parser).entries[-1], _BIB_MAPPING, _BIB_DATATYPES) - publication['bib'].update(parsed_bib) - publication['filled'] = True + parsed_bib = remap_bib( + bibtexparser.loads(bibtex, parser).entries[-1], + _BIB_MAPPING, + _BIB_DATATYPES, + ) + publication["bib"].update(parsed_bib) + publication["filled"] = True return publication - def citedby(self, publication: Publication) -> _SearchScholarIterator or list: """Searches Google Scholar for other articles that cite this Publication and returns a Publication generator. @@ -385,9 +427,9 @@ def citedby(self, publication: Publication) -> _SearchScholarIterator or list: :getter: Returns a Generator of Publications that cited the current. :type: Iterator[:class:`Publication`] """ - if not publication['filled']: + if not publication["filled"]: publication = self.fill(publication) - return _SearchScholarIterator(self.nav, publication['citedby_url']) + return _SearchScholarIterator(self.nav, publication["citedby_url"]) def bibtex(self, publication: Publication) -> str: """Returns the publication as a Bibtex entry @@ -398,15 +440,15 @@ def bibtex(self, publication: Publication) -> str: :getter: Returns a Bibtex entry in text format :type: str """ - if not publication['filled']: + if not publication["filled"]: publication = self.fill(publication) a = BibDatabase() - converted_dict = publication['bib'] + converted_dict = publication["bib"] try: - url = publication['eprint_url'] + url = publication["eprint_url"] except KeyError: - url = publication.get('pub_url', '') - converted_dict['url'] = url + url = publication.get("pub_url", "") + converted_dict["url"] = url converted_dict = remap_bib(converted_dict, _BIB_REVERSE_MAPPING) str_dict = {key: str(value) for key, value in converted_dict.items()} # convert every key of the dictionary to string to be Bibtex compatible @@ -417,37 +459,48 @@ def _get_bibtex(self, bib_url) -> str: """Retrieves the bibtex url""" soup = self.nav._get_soup(bib_url) - styles = soup.find_all('a', class_='gs_citi') + styles = soup.find_all("a", class_="gs_citi") for link in styles: if link.string.lower() == "bibtex": - return link.get('href') - return '' + return link.get("href") + return "" def _fill_public_access_mandates(self, publication: Publication) -> None: """Fills the public access mandates""" - if publication.get('public_access', None): - soup = self.nav._get_soup(_MANDATES_URL.format(publication['author_pub_id'])) - mandates = soup.find_all('li') + if publication.get("public_access", None): + soup = self.nav._get_soup( + _MANDATES_URL.format(publication["author_pub_id"]) + ) + mandates = soup.find_all("li") for mandate in mandates: m = Mandate() - m['agency'] = mandate.find('span', class_='gsc_md_mndt_name').text - m['url_policy'] = mandate.find('div', 
class_='gsc_md_mndt_title').a['href'] - m['url_policy_cached'] = mandate.find('span', class_='gs_a').a['href'] - for desc in mandate.find_all('div', class_='gsc_md_mndt_desc'): + m["agency"] = mandate.find("span", class_="gsc_md_mndt_name").text + m["url_policy"] = mandate.find("div", class_="gsc_md_mndt_title").a[ + "href" + ] + m["url_policy_cached"] = mandate.find("span", class_="gs_a").a["href"] + for desc in mandate.find_all("div", class_="gsc_md_mndt_desc"): match = re.search("Effective date: [0-9]{4}/[0-9]{1,2}", desc.text) if match: - m['effective_date'] = re.sub(pattern="Effective date: ", repl="", - string=desc.text[match.start() : match.end()]) + m["effective_date"] = re.sub( + pattern="Effective date: ", + repl="", + string=desc.text[match.start() : match.end()], + ) match = re.search("Embargo: ", desc.text) if match: - m['embargo'] = re.sub(pattern="Embargo: ", repl="", string=desc.text[match.end():]) + m["embargo"] = re.sub( + pattern="Embargo: ", + repl="", + string=desc.text[match.end() :], + ) if "Grant: " in desc.text: - m['grant'] = desc.text.split("Grant: ")[1] + m["grant"] = desc.text.split("Grant: ")[1] if "Funding acknowledgment" in desc.text: - m['acknowledgement'] = desc.find('span', class_='gs_gray').text + m["acknowledgement"] = desc.find("span", class_="gs_gray").text - publication['mandates'].append(m) + publication["mandates"].append(m) diff --git a/setup.py b/setup.py index b4ea378d..d91381f4 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import setuptools -with open("README.md", "r") as fh: +with open('README.md') as fh: long_description = fh.read() setuptools.setup( @@ -10,9 +10,8 @@ author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu', description='Simple access to Google Scholar authors and citations', long_description=long_description, - long_description_content_type="text/markdown", + long_description_content_type='text/markdown', license='Unlicense', - url='https://github.com/scholarly-python-package/scholarly', packages=setuptools.find_packages(), keywords=['Google Scholar', 'academics', 'citations'], @@ -23,22 +22,23 @@ 'Natural Language :: English', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Topic :: Software Development :: Libraries :: Python Modules'], - install_requires=['arrow', - 'beautifulsoup4', - 'bibtexparser', - 'deprecated', - 'fake_useragent', - 'free-proxy', - 'httpx', - 'python-dotenv', - 'requests[socks]', - 'selenium', - 'sphinx_rtd_theme', - 'typing_extensions' - ], + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + install_requires=[ + 'arrow', + 'beautifulsoup4', + 'bibtexparser', + 'deprecated', + 'fake_useragent', + 'free-proxy', + 'httpx', + 'python-dotenv', + 'requests[socks]', + 'selenium', + 'sphinx_rtd_theme', + ], extras_require={ 'tor': ['stem'], }, - test_suite="test_module.py" + test_suite='test_module.py', ) diff --git a/test_module.py b/test_module.py index bcd93e44..cf98dc28 100644 --- a/test_module.py +++ b/test_module.py @@ -1,16 +1,19 @@ -import unittest +import csv +import json import os +import random import sys +import unittest from collections import Counter -from scholarly import scholarly, ProxyGenerator -from scholarly.data_types import Mandate -from scholarly.publication_parser import PublicationParser -import random -import json -import csv +from contextlib import contextmanager + import requests from bs4 import BeautifulSoup -from contextlib import contextmanager + +from scholarly 
import ProxyGenerator, scholarly +from scholarly.data_types import Mandate +from scholarly.publication_parser import PublicationParser + try: import pandas as pd except ImportError: @@ -18,48 +21,49 @@ class TestLuminati(unittest.TestCase): - skipUnless = os.getenv("USERNAME") and os.getenv("PASSWORD") and os.getenv("PORT") + skipUnless = os.getenv('USERNAME') and os.getenv('PASSWORD') and os.getenv('PORT') - @unittest.skipUnless(skipUnless, reason="No Luminati credentials found.") + @unittest.skipUnless(skipUnless, reason='No Luminati credentials found.') def test_luminati(self): - """ - Test that we can set up Luminati (Bright Data) successfully - """ + """Test that we can set up Luminati (Bright Data) successfully.""" proxy_generator = ProxyGenerator() - success = proxy_generator.Luminati(usr=os.getenv("USERNAME"), - passwd=os.getenv("PASSWORD"), - proxy_port=os.getenv("PORT")) + success = proxy_generator.Luminati( + usr=os.getenv('USERNAME'), + passwd=os.getenv('PASSWORD'), + proxy_port=os.getenv('PORT'), + ) self.assertTrue(success) - self.assertEqual(proxy_generator.proxy_mode, "LUMINATI") + self.assertEqual(proxy_generator.proxy_mode, 'LUMINATI') class TestScraperAPI(unittest.TestCase): skipUnless = os.getenv('SCRAPER_API_KEY') - @unittest.skipUnless(skipUnless, reason="No ScraperAPI key found") + @unittest.skipUnless(skipUnless, reason='No ScraperAPI key found') def test_scraperapi(self): - """ - Test that we can set up ScraperAPI successfully - """ + """Test that we can set up ScraperAPI successfully.""" proxy_generator = ProxyGenerator() success = proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY')) self.assertTrue(success) - self.assertEqual(proxy_generator.proxy_mode, "SCRAPERAPI") + self.assertEqual(proxy_generator.proxy_mode, 'SCRAPERAPI') class TestTorInternal(unittest.TestCase): - skipUnless = [_bin for path in sys.path if os.path.isdir(path) for _bin in os.listdir(path) - if _bin in ('tor', 'tor.exe')] + skipUnless = [ + _bin + for path in sys.path + if os.path.isdir(path) + for _bin in os.listdir(path) + if _bin in ('tor', 'tor.exe') + ] @unittest.skipUnless(skipUnless, reason='Tor executable not found') def test_tor_launch_own_process(self): - """ - Test that we can launch a Tor process - """ + """Test that we can launch a Tor process.""" proxy_generator = ProxyGenerator() - if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): + if sys.platform.startswith('linux') or sys.platform.startswith('darwin'): tor_cmd = 'tor' - elif sys.platform.startswith("win"): + elif sys.platform.startswith('win'): tor_cmd = 'tor.exe' else: tor_cmd = None @@ -68,19 +72,18 @@ def test_tor_launch_own_process(self): tor_control_port = random.randrange(9500, 9999) result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port, tor_control_port) - self.assertTrue(result["proxy_works"]) - self.assertTrue(result["refresh_works"]) - self.assertEqual(result["tor_control_port"], tor_control_port) - self.assertEqual(result["tor_sock_port"], tor_sock_port) + self.assertTrue(result['proxy_works']) + self.assertTrue(result['refresh_works']) + self.assertEqual(result['tor_control_port'], tor_control_port) + self.assertEqual(result['tor_sock_port'], tor_sock_port) # Check that we can issue a query as well query = 'Ipeirotis' scholarly.use_proxy(proxy_generator) - authors = [a for a in scholarly.search_author(query)] + authors = list(scholarly.search_author(query)) self.assertGreaterEqual(len(authors), 1) class TestScholarly(unittest.TestCase): - @classmethod def setUpClass(cls): 
         scholarly.set_timeout(5)
@@ -93,22 +96,22 @@ def setUpClass(cls):
         # Try storing the file temporarily as `scholarly.csv` and delete it.
         # If there exists already a file with that name, generate a random name
         # that does not exist yet, so we can safely delete it.
-        cls.mandates_filename = "scholarly.csv"
+        cls.mandates_filename = 'scholarly.csv'
         while os.path.exists(cls.mandates_filename):
-            cls.mandates_filename = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + ".csv"
+            cls.mandates_filename = (
+                ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + '.csv'
+            )

     @classmethod
     def tearDownClass(cls):
-        """
-        Clean up the mandates csv fiile downloaded.
-        """
+        """Clean up the mandates csv file downloaded."""
         if os.path.exists(cls.mandates_filename):
             os.remove(cls.mandates_filename)

     @staticmethod
     @contextmanager
     def suppress_stdout():
-        with open(os.devnull, "w") as devnull:
+        with open(os.devnull, 'w') as devnull:
             old_stdout = sys.stdout
             sys.stdout = devnull
             try:
@@ -117,10 +120,8 @@ def suppress_stdout():
                 sys.stdout = old_stdout

     def test_search_author_empty_author(self):
-        """
-        Test that sholarly.search_author('') returns no authors
-        """
-        authors = [a for a in scholarly.search_author('')]
+        """Test that scholarly.search_author('') returns no authors."""
+        authors = list(scholarly.search_author(''))
         self.assertIs(len(authors), 0)

     def test_search_keywords(self):
@@ -131,18 +132,15 @@ def test_search_keywords(self):
         self.assertEqual(author['affiliation'], 'Cornell University')

     def test_search_keyword_empty_keyword(self):
-        """
-        As of 2020-04-30, there are 6 individuals that match the name 'label'
-        """
+        """As of 2020-04-30, there are 6 individuals that match the name 'label'."""
         # TODO this seems like undesirable functionality for
         # scholarly.search_keyword() with empty string. Surely, no authors
         # should be returned. Consider modifying the method itself.
-        authors = [a for a in scholarly.search_keyword('')]
+        authors = list(scholarly.search_keyword(''))
         self.assertGreaterEqual(len(authors), 6)

     def test_search_keyword(self):
-        """
-        Test that we can search based on specific keywords
+        """Test that we can search based on specific keywords.

         When we search for the keyword "3d shape" the author Steven
         A. Cholewiak should be among those listed.
@@ -152,76 +150,99 @@ def test_search_keyword(self):
         # Example 1
         authors = [a['name'] for a in scholarly.search_keyword('3d shape')]
         self.assertIsNot(len(authors), 0)
-        self.assertIn(u'Steven A. Cholewiak, PhD', authors)
+        self.assertIn('Steven A. 
Cholewiak, PhD', authors) # Example 2 - expected_author = {'affiliation': 'Stanford University', - 'citedby': 43856, - 'email_domain': '@cs.stanford.edu', - 'filled': [], - 'interests': ['Robotics', - 'Haptics', - 'Human Motion Understanding'], - 'name': 'Oussama Khatib', - 'scholar_id': '4arkOLcAAAAJ', - 'source': 'SEARCH_AUTHOR_SNIPPETS', - 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4arkOLcAAAAJ' - } + expected_author = { + 'affiliation': 'Stanford University', + 'citedby': 43856, + 'email_domain': '@cs.stanford.edu', + 'filled': [], + 'interests': ['Robotics', 'Haptics', 'Human Motion Understanding'], + 'name': 'Oussama Khatib', + 'scholar_id': '4arkOLcAAAAJ', + 'source': 'SEARCH_AUTHOR_SNIPPETS', + 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4arkOLcAAAAJ', + } search_query = scholarly.search_keyword('Haptics') author = next(search_query) for key in author: - if (key not in {"citedby", "container_type", "interests"}) and (key in expected_author): + if (key not in {'citedby', 'container_type', 'interests'}) and ( + key in expected_author + ): self.assertEqual(author[key], expected_author[key]) - self.assertEqual(set(author["interests"]), set(expected_author["interests"])) + self.assertEqual(set(author['interests']), set(expected_author['interests'])) # Example 3 - expected_author = {'affiliation': "CEA, Département d'Astrophysique", - 'citedby': 98936, - 'email_domain': '@cea.fr', - 'filled': [], - 'interests': ['Cosmology (CMB', - 'weak-lensing', - 'large scale structure)', - 'Statistics', - 'Image Processing'], - 'name': 'Jean-Luc Starck', - 'scholar_id': 'IAaAiXgAAAAJ', - 'source': 'SEARCH_AUTHOR_SNIPPETS', - 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=IAaAiXgAAAAJ' - } + expected_author = { + 'affiliation': "CEA, Département d'Astrophysique", + 'citedby': 98936, + 'email_domain': '@cea.fr', + 'filled': [], + 'interests': [ + 'Cosmology (CMB', + 'weak-lensing', + 'large scale structure)', + 'Statistics', + 'Image Processing', + ], + 'name': 'Jean-Luc Starck', + 'scholar_id': 'IAaAiXgAAAAJ', + 'source': 'SEARCH_AUTHOR_SNIPPETS', + 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=IAaAiXgAAAAJ', + } search_query = scholarly.search_keyword('large-scale structure') author = next(search_query) for key in author: - if (key not in {"citedby", "container_type", "interests"}) and (key in expected_author): + if (key not in {'citedby', 'container_type', 'interests'}) and ( + key in expected_author + ): self.assertEqual(author[key], expected_author[key]) scholarly.pprint(author) - self.assertEqual(set(author["interests"]), set(expected_author["interests"])) + self.assertEqual(set(author['interests']), set(expected_author['interests'])) def test_search_author_single_author(self): query = 'Steven A. Cholewiak' - authors = [a for a in scholarly.search_author(query)] + authors = list(scholarly.search_author(query)) self.assertGreaterEqual(len(authors), 1) author = scholarly.fill(authors[0]) - self.assertEqual(author['name'], u'Steven A. Cholewiak, PhD') - self.assertEqual(author['scholar_id'], u'4bahYMkAAAAJ') + self.assertEqual(author['name'], 'Steven A. 
Cholewiak, PhD') + self.assertEqual(author['scholar_id'], '4bahYMkAAAAJ') - self.assertEqual(author['homepage'], "http://steven.cholewiak.com/") + self.assertEqual(author['homepage'], 'http://steven.cholewiak.com/') self.assertEqual(author['organization'], 6518679690484165796) self.assertGreaterEqual(author['public_access']['available'], 10) - self.assertEqual(author['public_access']['available'], - sum(pub.get('public_access', None) is True for pub in author['publications'])) - self.assertEqual(author['public_access']['not_available'], - sum(pub.get('public_access', None) is False for pub in author['publications'])) + self.assertEqual( + author['public_access']['available'], + sum( + pub.get('public_access', None) is True for pub in author['publications'] + ), + ) + self.assertEqual( + author['public_access']['not_available'], + sum( + pub.get('public_access', None) is False + for pub in author['publications'] + ), + ) pub = author['publications'][1] - self.assertEqual(pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC') + self.assertEqual(pub['author_pub_id'], '4bahYMkAAAAJ:LI9QrySNdTsC') self.assertTrue('5738786554683183717' in pub['cites_id']) scholarly.fill(pub) - self.assertEqual(pub['pub_url'], "https://dl.acm.org/doi/abs/10.1145/3130800.3130815") - mandate = Mandate(agency="US National Science Foundation", effective_date="2016/1", embargo="12 months", - url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - url_policy_cached="/mandates/nsf-2021-02-13.pdf", - grant="BCS-1354029") - self.assertIn(mandate['agency'], [_mandate['agency'] for _mandate in pub['mandates']]) + self.assertEqual( + pub['pub_url'], 'https://dl.acm.org/doi/abs/10.1145/3130800.3130815' + ) + mandate = Mandate( + agency='US National Science Foundation', + effective_date='2016/1', + embargo='12 months', + url_policy='https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf', + url_policy_cached='/mandates/nsf-2021-02-13.pdf', + grant='BCS-1354029', + ) + self.assertIn( + mandate['agency'], [_mandate['agency'] for _mandate in pub['mandates']] + ) # Trigger the pprint method, but suppress the output with self.suppress_stdout(): scholarly.pprint(author) @@ -230,56 +251,58 @@ def test_search_author_single_author(self): self.assertGreaterEqual(len(author['coauthors']), 20) if len(author['coauthors']) > 20: self.assertGreaterEqual(len(author['coauthors']), 35) - self.assertTrue('I23YUh8AAAAJ' in [_coauth['scholar_id'] for _coauth in author['coauthors']]) + self.assertTrue( + 'I23YUh8AAAAJ' + in [_coauth['scholar_id'] for _coauth in author['coauthors']] + ) def test_search_author_multiple_authors(self): - """ - As of May 12, 2020 there are at least 24 'Cattanis's listed as authors - and Giordano Cattani is one of them + """As of May 12, 2020 there are at least 24 'Cattanis's listed as authors + and Giordano Cattani is one of them. """ authors = [a['name'] for a in scholarly.search_author('cattani')] self.assertGreaterEqual(len(authors), 24) - self.assertIn(u'Giordano Cattani', authors) + self.assertIn('Giordano Cattani', authors) def test_search_author_id(self): - """ - Test the search by author ID. Marie Skłodowska-Curie's ID is - EmD_lTEAAAAJ and these IDs are permanent + """Test the search by author ID. Marie Skłodowska-Curie's ID is + EmD_lTEAAAAJ and these IDs are permanent. 
""" author = scholarly.search_author_id('EmD_lTEAAAAJ') - self.assertEqual(author['name'], u'Marie Skłodowska-Curie') - self.assertEqual(author['affiliation'], - u'Institut du radium, University of Paris') + self.assertEqual(author['name'], 'Marie Skłodowska-Curie') + self.assertEqual( + author['affiliation'], 'Institut du radium, University of Paris' + ) def test_search_author_id_filled(self): - """ - Test the search by author ID. Marie Skłodowska-Curie's ID is + """Test the search by author ID. Marie Skłodowska-Curie's ID is EmD_lTEAAAAJ and these IDs are permanent. As of July 2020, Marie Skłodowska-Curie has 1963 citations - on Google Scholar and 179 publications + on Google Scholar and 179 publications. """ author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True) - self.assertEqual(author['name'], u'Marie Skłodowska-Curie') - self.assertEqual(author['affiliation'], - u'Institut du radium, University of Paris') + self.assertEqual(author['name'], 'Marie Skłodowska-Curie') + self.assertEqual( + author['affiliation'], 'Institut du radium, University of Paris' + ) self.assertEqual(author['interests'], []) self.assertEqual(author['public_access']['available'], 0) self.assertEqual(author['public_access']['not_available'], 0) self.assertGreaterEqual(author['citedby'], 2090) self.assertGreaterEqual(len(author['publications']), 218) - cpy = {1986:4, 2011: 137, 2018: 100} + cpy = {1986: 4, 2011: 137, 2018: 100} for year, count in cpy.items(): - self.assertEqual(author["cites_per_year"][year], count) + self.assertEqual(author['cites_per_year'][year], count) pub = author['publications'][1] - self.assertEqual(pub["citedby_url"], - "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702") - + self.assertEqual( + pub['citedby_url'], + 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702', + ) def test_extract_author_id_list(self): - ''' - This unit test tests the extraction of the author id field from the html to populate the `author_id` field + """This unit test tests the extraction of the author id field from the html to populate the `author_id` field in the Publication object. - ''' + """ author_html_full = 'SA Cholewiak, GD Love, MS Banks - Journal of vision, 2018 - jov.arvojournals.org' pub_parser = PublicationParser(None) author_id_list = pub_parser._get_author_id_list(author_html_full) @@ -287,29 +310,31 @@ def test_extract_author_id_list(self): self.assertTrue(author_id_list[1] == '3xJXtlwAAAAJ') self.assertTrue(author_id_list[2] == 'Smr99uEAAAAJ') - author_html_partial = "A Bateman, J O'Connell, N Lorenzini, T Gardner… - BMC psychiatry, 2016 - Springer" + author_html_partial = 'A Bateman, J O\'Connell, N Lorenzini, T Gardner… - BMC psychiatry, 2016 - Springer' pub_parser = PublicationParser(None) author_id_list = pub_parser._get_author_id_list(author_html_partial) self.assertTrue(author_id_list[3] == 'TEndP-sAAAAJ') def test_serialiazation(self): - """ - Test that we can serialize the Author and Publication types + """Test that we can serialize the Author and Publication types. Note: JSON converts integer keys to strings, resulting in the years in `cites_per_year` dictionary as `str` type instead of `int`. To ensure consistency with the typing, use `object_hook` option when loading to convert the keys to integers. """ + # Test that a filled Author with unfilled Publication # is serializable. def cpy_decoder(di): """A utility function to convert the keys in `cites_per_year` to `int` type. - This ensures consistency with `CitesPerYear` typing. 
+ This ensures consistency with `CitesPerYear` typing. """ - if "cites_per_year" in di: - di["cites_per_year"] = {int(k): v for k,v in di["cites_per_year"].items()} + if 'cites_per_year' in di: + di['cites_per_year'] = { + int(k): v for k, v in di['cites_per_year'].items() + } return di author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True) @@ -324,8 +349,7 @@ def cpy_decoder(di): self.assertEqual(pub, pub_loaded) def test_full_title(self): - """ - Test if the full title of a long title-publication gets retrieved. + """Test if the full title of a long title-publication gets retrieved. The code under test gets executed if: publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY so the long title-publication is taken from an author object. @@ -336,133 +360,168 @@ def test_full_title(self): # Skip this part of the test since u_35RYKgDlwC has vanished from Google Scholar if False: for i in range(len(author['publications'])): - if author['publications'][i]['author_pub_id'] == 'Xxjj6IsAAAAJ:u_35RYKgDlwC': + if ( + author['publications'][i]['author_pub_id'] + == 'Xxjj6IsAAAAJ:u_35RYKgDlwC' + ): pub_index = i self.assertGreaterEqual(i, 0) # elided title - self.assertEqual(author['publications'][pub_index]['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + author['publications'][pub_index]['bib']['title'], + 'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …', + ) # full text pub = scholarly.fill(author['publications'][pub_index]) - self.assertEqual(pub['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation') + self.assertEqual( + pub['bib']['title'], + 'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation', + ) - self.assertEqual(pub['bib']['citation'], "") + self.assertEqual(pub['bib']['citation'], '') for i in range(len(author['publications'])): - if author['publications'][i]['author_pub_id'] == 'Xxjj6IsAAAAJ:ldfaerwXgEUC': + if ( + author['publications'][i]['author_pub_id'] + == 'Xxjj6IsAAAAJ:ldfaerwXgEUC' + ): pub_index = i self.assertGreaterEqual(i, 0) # elided title - self.assertEqual(author['publications'][pub_index]['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + author['publications'][pub_index]['bib']['title'], + 'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …', + ) # full text pub = scholarly.fill(author['publications'][pub_index]) - self.assertEqual(pub['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + pub['bib']['title'], + 'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and 
possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …',
+        )

-        self.assertEqual(pub['bib']['citation'], "Journal of Fisheries and Life Sciences 5 (2), 74-84, 2020")
+        self.assertEqual(
+            pub['bib']['citation'],
+            'Journal of Fisheries and Life Sciences 5 (2), 74-84, 2020',
+        )

     def test_author_organization(self):
-        """
-        """
+        """Test that we can search for an organization and fetch authors affiliated with it."""
         organization_id = 4836318610601440500  # Princeton University
-        organizations = scholarly.search_org("Princeton University")
+        organizations = scholarly.search_org('Princeton University')
         self.assertEqual(len(organizations), 1)

         organization = organizations[0]
-        self.assertEqual(organization['Organization'], "Princeton University")
+        self.assertEqual(organization['Organization'], 'Princeton University')
         self.assertEqual(organization['id'], str(organization_id))

         search_query = scholarly.search_author_by_organization(organization_id)
         author = next(search_query)
-        self.assertEqual(author['scholar_id'], "ImhakoAAAAAJ")
-        self.assertEqual(author['name'], "Daniel Kahneman")
-        self.assertEqual(author['email_domain'], "@princeton.edu")
-        self.assertEqual(author['affiliation'], "Princeton University (Emeritus)")
+        self.assertEqual(author['scholar_id'], 'ImhakoAAAAAJ')
+        self.assertEqual(author['name'], 'Daniel Kahneman')
+        self.assertEqual(author['email_domain'], '@princeton.edu')
+        self.assertEqual(author['affiliation'], 'Princeton University (Emeritus)')
         self.assertGreaterEqual(author['citedby'], 438891)

     def test_coauthors(self):
-        """
-        Test that we can fetch long (20+) and short list of coauthors
-        """
+        """Test that we can fetch long (20+) and short list of coauthors."""
         author = scholarly.search_author_id('7Jl3PIoAAAAJ')
         scholarly.fill(author, sections=['basics', 'coauthors'])
-        self.assertEqual(author['name'], "Victor Silva")
+        self.assertEqual(author['name'], 'Victor Silva')
         self.assertLessEqual(len(author['coauthors']), 20)
         # If the above assertion fails, pick a different author profile
         self.assertGreaterEqual(len(author['coauthors']), 6)
-        self.assertIn('Eleni Stroulia', [_coauth['name'] for _coauth in author['coauthors']])
-        self.assertIn('TyM1dLwAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']])
+        self.assertIn(
+            'Eleni Stroulia', [_coauth['name'] for _coauth in author['coauthors']]
+        )
+        self.assertIn(
+            'TyM1dLwAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']]
+        )

         # Fill co-authors
         for _coauth in author['coauthors']:
             scholarly.fill(_coauth, sections=['basics'])
-        self.assertIn(16627554827500071773, [_coauth.get('organization', None) for _coauth in author['coauthors']])
+        self.assertIn(
+            16627554827500071773,
+            [_coauth.get('organization', None) for _coauth in author['coauthors']],
+        )

         author = scholarly.search_author_id('PA9La6oAAAAJ')
         scholarly.fill(author, sections=['basics', 'coauthors'])
-        self.assertEqual(author['name'], "Panos Ipeirotis")
+        self.assertEqual(author['name'], 'Panos Ipeirotis')
         self.assertGreaterEqual(len(author['coauthors']), 66)
         # Break the build if the long list cannot be fetched.
- self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']]) - self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']]) + self.assertIn( + 'Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']] + ) + self.assertIn( + 'hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']] + ) def test_public_access(self): - """ - Test that we obtain public access information + """Test that we obtain public access information. We check two cases: 1) when number of public access mandates exceeds 100, thus requiring fetching information from a second page and 2) fill public access counts without fetching publications. """ - author = scholarly.search_author_id("f4KlrXIAAAAJ") + author = scholarly.search_author_id('f4KlrXIAAAAJ') scholarly.fill(author, sections=['basics', 'public_access', 'publications']) - self.assertGreaterEqual(author["public_access"]["available"], 1150) - self.assertEqual(author["public_access"]["available"], - sum(pub.get("public_access", None) is True for pub in author["publications"])) - self.assertEqual(author["public_access"]["not_available"], - sum(pub.get("public_access", None) is False for pub in author["publications"])) - - author = next(scholarly.search_author("Daniel Kahneman")) - self.assertEqual(author["scholar_id"], "ImhakoAAAAAJ") - self.assertEqual(author["interests"], []) - scholarly.fill(author, sections=["public_access"]) - self.assertGreaterEqual(author["public_access"]["available"], 5) + self.assertGreaterEqual(author['public_access']['available'], 1150) + self.assertEqual( + author['public_access']['available'], + sum( + pub.get('public_access', None) is True for pub in author['publications'] + ), + ) + self.assertEqual( + author['public_access']['not_available'], + sum( + pub.get('public_access', None) is False + for pub in author['publications'] + ), + ) + + author = next(scholarly.search_author('Daniel Kahneman')) + self.assertEqual(author['scholar_id'], 'ImhakoAAAAAJ') + self.assertEqual(author['interests'], []) + scholarly.fill(author, sections=['public_access']) + self.assertGreaterEqual(author['public_access']['available'], 5) def test_mandates(self): - """ - Test that we can fetch the funding information of a paper from an author - """ - author = scholarly.search_author_id("kUDCLXAAAAAJ") + """Test that we can fetch the funding information of a paper from an author.""" + author = scholarly.search_author_id('kUDCLXAAAAAJ') scholarly.fill(author, sections=['public_access', 'publications']) for pub in author['publications']: - if pub['author_pub_id'] == "kUDCLXAAAAAJ:tzM49s52ZIMC": + if pub['author_pub_id'] == 'kUDCLXAAAAAJ:tzM49s52ZIMC': scholarly.fill(pub) break # The hard-coded reference mandate may need regular updates. 
- mandate = Mandate(agency="European Commission", effective_date="2013/12", embargo="6 months", grant="647112", - url_policy="https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf", - url_policy_cached="/mandates/horizon2020_eu-2021-02-13-en.pdf", + mandate = Mandate( + agency='European Commission', + effective_date='2013/12', + embargo='6 months', + grant='647112', + url_policy='https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf', + url_policy_cached='/mandates/horizon2020_eu-2021-02-13-en.pdf', ) self.assertIn(mandate, pub['mandates']) def test_author_custom_url(self): - """ - Test that we can use custom URLs for retrieving author data - """ - query_url = "/citations?hl=en&view_op=search_authors&mauthors=label%3A3d_shape" + """Test that we can use custom URLs for retrieving author data.""" + query_url = '/citations?hl=en&view_op=search_authors&mauthors=label%3A3d_shape' authors = scholarly.search_author_custom_url(query_url) - self.assertIn(u'Steven A. Cholewiak, PhD', [author['name'] for author in authors]) + self.assertIn( + 'Steven A. Cholewiak, PhD', [author['name'] for author in authors] + ) - @unittest.skipIf(sys.platform.startswith("win"), reason="File read is empty in Windows") + @unittest.skipIf( + sys.platform.startswith('win'), reason='File read is empty in Windows' + ) def test_download_mandates_csv(self): - """ - Test that we can download the mandates CSV and read it. - """ + """Test that we can download the mandates CSV and read it.""" if not os.path.exists(self.mandates_filename): text = scholarly.download_mandates_csv(self.mandates_filename) self.assertGreater(len(text), 0) funder, policy, percentage2020, percentageOverall = [], [], [], [] - with open(self.mandates_filename, "r") as f: + with open(self.mandates_filename) as f: csv_reader = csv.DictReader(f) for row in csv_reader: funder.append(row['\ufeffFunder']) @@ -471,85 +530,108 @@ def test_download_mandates_csv(self): percentageOverall.append(row['Overall']) agency_policy = { - "US National Science Foundation": "https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - "Department of Science & Technology, India": "http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf", - "Swedish Research Council": "https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "" + 'US National Science Foundation': 'https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf', + 'Department of Science & Technology, India': 'http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf', + 'Swedish Research Council': 'https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html', + 'Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning': '', } agency_2020 = { - "US National Science Foundation": "87%", - "Department of Science & Technology, India": "49%", - "Swedish Research Council": "89%", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "88%" + 'US National Science Foundation': '87%', + 'Department of Science & Technology, India': '49%', + 'Swedish Research Council': '89%', + 'Swedish Research Council for Environment, Agricultural Sciences and Spatial 
Planning': '88%', } - response = requests.get("https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en") - soup = BeautifulSoup(response.text, "html.parser") - agency_overall = soup.find_all("td", class_="gsc_mlt_n gsc_mlt_bd") + response = requests.get( + 'https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en' + ) + soup = BeautifulSoup(response.text, 'html.parser') + agency_overall = soup.find_all('td', class_='gsc_mlt_n gsc_mlt_bd') # These hardcoded numbers need some regular updates. - for agency, index in zip(agency_policy, [5-1,9-1, 21-1, 63-1]): + for agency, index in zip(agency_policy, [5 - 1, 9 - 1, 21 - 1, 63 - 1]): agency_index = funder.index(agency) self.assertEqual(policy[agency_index], agency_policy[agency]) # Check that the percentage values from CSV and on the page agree. - self.assertEqual(percentageOverall[agency_index], agency_overall[index].text) + self.assertEqual( + percentageOverall[agency_index], agency_overall[index].text + ) # The percentage fluctuates, so we can't check the exact value. - self.assertAlmostEqual(int(percentage2020[agency_index][:-1]), int(agency_2020[agency][:-1]), delta=2) - - @unittest.skipIf(sys.platform.startswith("win"), reason="File read is empty in Windows") - @unittest.skipIf(pd is None, reason="pandas is not installed") + self.assertAlmostEqual( + int(percentage2020[agency_index][:-1]), + int(agency_2020[agency][:-1]), + delta=2, + ) + + @unittest.skipIf( + sys.platform.startswith('win'), reason='File read is empty in Windows' + ) + @unittest.skipIf(pd is None, reason='pandas is not installed') def test_download_mandates_csv_with_pandas(self): - """ - Test that we can use pandas to read the CSV file - """ + """Test that we can use pandas to read the CSV file.""" if not os.path.exists(self.mandates_filename): text = scholarly.download_mandates_csv(self.mandates_filename) self.assertGreater(len(text), 0) - df = pd.read_csv(self.mandates_filename, usecols=["Funder", "Policy", "2020", "Overall"]).fillna("") + df = pd.read_csv( + self.mandates_filename, usecols=['Funder', 'Policy', '2020', 'Overall'] + ).fillna('') self.assertGreater(len(df), 0) - funders = ["US National Science Foundation", - "Department of Science & Technology, India", - "Swedish Research Council", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning" - ] - - policies = ["https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - "http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf", - "https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html", - "" - ] + funders = [ + 'US National Science Foundation', + 'Department of Science & Technology, India', + 'Swedish Research Council', + 'Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning', + ] + + policies = [ + 'https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf', + 'http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf', + 'https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html', + '', + ] percentage_overall = [84, 54, 83, 83] percentage_2020 = [87, 49, 89, 88] - rows = df["Funder"].isin(funders) + rows = df['Funder'].isin(funders) self.assertEqual(rows.sum(), 4) - self.assertEqual(df["Policy"][rows].tolist(), policies) - df_overall = df["Overall"][rows].tolist() - df_2020 = df["2020"][rows].tolist() + 
self.assertEqual(df['Policy'][rows].tolist(), policies) + df_overall = df['Overall'][rows].tolist() + df_2020 = df['2020'][rows].tolist() for idx in range(4): - self.assertAlmostEqual(int(df_overall[idx][:-1]), percentage_overall[idx], delta=2) - self.assertAlmostEqual(int(df_2020[idx][:-1]), percentage_2020[idx], delta=2) + self.assertAlmostEqual( + int(df_overall[idx][:-1]), percentage_overall[idx], delta=2 + ) + self.assertAlmostEqual( + int(df_2020[idx][:-1]), percentage_2020[idx], delta=2 + ) def test_save_journal_leaderboard(self): - """ - Test that we can save the journal leaderboard to a file - """ - filename = "journals.csv" + """Test that we can save the journal leaderboard to a file.""" + filename = 'journals.csv' while os.path.exists(filename): - filename = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + ".csv" + filename = ( + ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + '.csv' + ) try: - scholarly.save_journals_csv(category="Physics & Mathematics", subcategory="Astronomy & Astrophysics", - filename=filename, include_comments=True) - with open(filename, "r") as f: + scholarly.save_journals_csv( + category='Physics & Mathematics', + subcategory='Astronomy & Astrophysics', + filename=filename, + include_comments=True, + ) + with open(filename) as f: csv_reader = csv.DictReader(f) for row in csv_reader: # These hard-coded values need regular updates. self.assertEqual(row['Publication'], 'The Astrophysical Journal') self.assertEqual(row['h5-index'], '167') self.assertEqual(row['h5-median'], '234') - self.assertEqual(row['Comment'], '#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ') + self.assertEqual( + row['Comment'], + '#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ', + ) break finally: if os.path.exists(filename): @@ -557,46 +639,66 @@ def test_save_journal_leaderboard(self): def test_bin_citations_by_year(self): """Test an internal optimization function to bin cites_per_year - while keeping the citation counts less than 1000 per bin. - """ - cpy = {2022: 490, 2021: 340, 2020:327, 2019:298, 2018: 115, 2017: 49, 2016: 20, 2015: 8, 2014: 3, 2013: 1, 2012: 1} + while keeping the citation counts less than 1000 per bin. + """ + cpy = { + 2022: 490, + 2021: 340, + 2020: 327, + 2019: 298, + 2018: 115, + 2017: 49, + 2016: 20, + 2015: 8, + 2014: 3, + 2013: 1, + 2012: 1, + } years = scholarly._bin_citations_by_year(cpy, 2022) for y_hi, y_lo in years: self.assertLessEqual(y_lo, y_hi) - self.assertLessEqual(sum(cpy[y] for y in range(y_lo, y_hi+1)), 1000) + self.assertLessEqual(sum(cpy[y] for y in range(y_lo, y_hi + 1)), 1000) def test_cites_per_year(self): """Test that the cites_per_year is correctly filled in, - including any gap years. + including any gap years. """ author = scholarly.search_author_id('DW_bVcEAAAAJ') scholarly.fill(author, sections=['counts']) - cpy = {2014: 1, 2015: 2, 2016: 2, 2017: 0, 2018: 2, 2019: 0, 2020: 11, 2021: 21, 2022: 37} + cpy = { + 2014: 1, + 2015: 2, + 2016: 2, + 2017: 0, + 2018: 2, + 2019: 0, + 2020: 11, + 2021: 21, + 2022: 37, + } for year, count in cpy.items(): self.assertEqual(author['cites_per_year'][year], count) def test_redirect(self): - """Test that we can handle redirects when the scholar_id is approximate. 
- """ - author = scholarly.search_author_id("oMaIg8sAAAAJ") - self.assertEqual(author["scholar_id"], "PEJ42J0AAAAJ") - scholarly.fill(author, sections=["basics"]) - self.assertEqual(author["name"], "Kiran Bhatia") - self.assertGreaterEqual(author["citedby"], 135) + """Test that we can handle redirects when the scholar_id is approximate.""" + author = scholarly.search_author_id('oMaIg8sAAAAJ') + self.assertEqual(author['scholar_id'], 'PEJ42J0AAAAJ') + scholarly.fill(author, sections=['basics']) + self.assertEqual(author['name'], 'Kiran Bhatia') + self.assertGreaterEqual(author['citedby'], 135) + class TestScholarlyWithProxy(unittest.TestCase): @classmethod def setUpClass(cls): - """ - Setup the proxy methods for unit tests - """ + """Setup the proxy methods for unit tests.""" scholarly.set_timeout(5) scholarly.set_retries(5) - if "CONNECTION_METHOD" in scholarly.env: - cls.connection_method = os.getenv("CONNECTION_METHOD") + if 'CONNECTION_METHOD' in scholarly.env: + cls.connection_method = os.getenv('CONNECTION_METHOD') else: - cls.connection_method = "none" + cls.connection_method = 'none' scholarly.use_proxy(None) return @@ -605,43 +707,44 @@ def setUpClass(cls): secondary_proxy_generator.FreeProxies() proxy_generator = ProxyGenerator() - if cls.connection_method == "tor": - tor_password = "scholarly_password" + if cls.connection_method == 'tor': + tor_password = 'scholarly_password' # Tor uses the 9050 port as the default socks port # on windows 9150 for socks and 9151 for control - if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): + if sys.platform.startswith('linux') or sys.platform.startswith('darwin'): tor_sock_port = 9050 tor_control_port = 9051 - elif sys.platform.startswith("win"): + elif sys.platform.startswith('win'): tor_sock_port = 9150 tor_control_port = 9151 else: tor_sock_port = None tor_control_port = None - proxy_generator.Tor_External(tor_sock_port, tor_control_port, - tor_password) + proxy_generator.Tor_External(tor_sock_port, tor_control_port, tor_password) - elif cls.connection_method == "tor_internal": - if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): + elif cls.connection_method == 'tor_internal': + if sys.platform.startswith('linux') or sys.platform.startswith('darwin'): tor_cmd = 'tor' - elif sys.platform.startswith("win"): + elif sys.platform.startswith('win'): tor_cmd = 'tor.exe' else: tor_cmd = None - proxy_generator.Tor_Internal(tor_cmd = tor_cmd) + proxy_generator.Tor_Internal(tor_cmd=tor_cmd) - elif cls.connection_method == "luminati": + elif cls.connection_method == 'luminati': scholarly.set_retries(10) - proxy_generator.Luminati(usr=os.getenv("USERNAME"), - passwd=os.getenv("PASSWORD"), - proxy_port=os.getenv("PORT")) + proxy_generator.Luminati( + usr=os.getenv('USERNAME'), + passwd=os.getenv('PASSWORD'), + proxy_port=os.getenv('PORT'), + ) - elif cls.connection_method == "freeproxy": + elif cls.connection_method == 'freeproxy': # Use different instances for primary and secondary proxy_generator = ProxyGenerator() proxy_generator.FreeProxies() - elif cls.connection_method == "scraperapi": + elif cls.connection_method == 'scraperapi': proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY')) else: @@ -650,15 +753,12 @@ def setUpClass(cls): scholarly.use_proxy(proxy_generator, secondary_proxy_generator) def test_search_pubs_empty_publication(self): - """ - Test that searching for an empty publication returns zero results - """ - pubs = [p for p in scholarly.search_pubs('')] + """Test that searching for an 
empty publication returns zero results.""" + pubs = list(scholarly.search_pubs('')) self.assertIs(len(pubs), 0) def test_search_pubs_citedby(self): - """ - Testing that when we retrieve the list of publications that cite + """Testing that when we retrieve the list of publications that cite a publication, the number of citing publication is the same as the number of papers that are returned. We use a publication with a small number of citations, so that the test runs quickly. @@ -666,15 +766,14 @@ def test_search_pubs_citedby(self): June 1, 2020. """ query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale' - pubs = [p for p in scholarly.search_pubs(query)] + pubs = list(scholarly.search_pubs(query)) self.assertGreaterEqual(len(pubs), 1) filled = scholarly.fill(pubs[0]) - cites = [c for c in scholarly.citedby(filled)] + cites = list(scholarly.citedby(filled)) self.assertEqual(len(cites), filled['num_citations']) def test_search_pubs_citedby_id(self): - """ - Test querying for citations by paper ID. + """Test querying for citations by paper ID. The 'Machine-learned epidemiology' paper had 11 citations as of June 1, 2020. @@ -682,62 +781,68 @@ def test_search_pubs_citedby_id(self): # Machine-learned epidemiology: real-time detection of foodborne illness at scale publication_id = 2244396665447968936 - pubs = [p for p in scholarly.search_citedby(publication_id)] + pubs = list(scholarly.search_citedby(publication_id)) self.assertGreaterEqual(len(pubs), 11) def test_bibtex(self): - """ - Test that we get the BiBTeX entry correctly - """ - - with open("testdata/bibtex.txt", "r") as f: - expected_result = "".join(f.readlines()) - - pub = scholarly.search_single_pub("A distribution-based clustering algorithm for mining in large " - "spatial databases", filled=True) + """Test that we get the BiBTeX entry correctly.""" + with open('testdata/bibtex.txt') as f: + expected_result = ''.join(f.readlines()) + + pub = scholarly.search_single_pub( + 'A distribution-based clustering algorithm for mining in large ' + 'spatial databases', + filled=True, + ) result = scholarly.bibtex(pub) self.assertEqual(result, expected_result) def test_search_pubs(self): - """ - As of May 12, 2020 there are at least 29 pubs that fit the search term: + """As of May 12, 2020 there are at least 29 pubs that fit the search term: ["naive physics" stability "3d shape"]. Check that the paper "Visual perception of the physical stability of asymmetric three-dimensional objects" is among them """ - pub = scholarly.search_single_pub("naive physics stability 3d shape") + pub = scholarly.search_single_pub('naive physics stability 3d shape') pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"')) # Check that the first entry in pubs is the same as pub. # Checking for quality holds for non-dict entries only. 
-        for key in {'author_id', 'pub_url', 'num_citations'}:
+        for key in ('author_id', 'pub_url', 'num_citations'):
             self.assertEqual(pub[key], pubs[0][key])
-        for key in {'title', 'pub_year', 'venue'}:
+        for key in ('title', 'pub_year', 'venue'):
             self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])

         self.assertGreaterEqual(len(pubs), 27)
         titles = [p['bib']['title'] for p in pubs]
-        self.assertIn('Visual perception of the physical stability of asymmetric three-dimensional objects', titles)
+        self.assertIn(
+            'Visual perception of the physical stability of asymmetric three-dimensional objects',
+            titles,
+        )

     def test_search_pubs_single_pub(self):
-        """
-        As of Jun 24, 2024 there are is only one pub that fits the search term:
+        """As of Jun 24, 2024 there is only one pub that fits the search term:
         [Perception of physical stability and center of mass of 3D objects].

         Check that it returns a proper result and the total results for that search term
         is equal to 1.
         """
-        pub = scholarly.search_single_pub("Perception of physical stability and center of mass of 3D objects")
-        pubs = list(scholarly.search_pubs("Perception of physical stability and center of mass of 3D objects"))
+        pub = scholarly.search_single_pub(
+            'Perception of physical stability and center of mass of 3D objects'
+        )
+        pubs = list(
+            scholarly.search_pubs(
+                'Perception of physical stability and center of mass of 3D objects'
+            )
+        )
         # Check that the first entry in pubs is the same as pub.
         # Checking for quality holds for non-dict entries only.
-        for key in {'author_id', 'pub_url', 'num_citations'}:
+        for key in ('author_id', 'pub_url', 'num_citations'):
             self.assertEqual(pub[key], pubs[0][key])
-        for key in {'title', 'pub_year', 'venue'}:
+        for key in ('title', 'pub_year', 'venue'):
             self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])
         self.assertEqual(len(pubs), 1)

     def test_search_pubs_total_results(self):
-        """
-        As of September 16, 2021 there are 32 pubs that fit the search term:
+        """As of September 16, 2021 there are 32 pubs that fit the search term:
         ["naive physics" stability "3d shape"], and 17'000 results that fit
         the search term ["WIEN2k Blaha"] and none for ["sdfsdf+24r+asdfasdf"].
@@ -753,128 +858,157 @@ def test_search_pubs_total_results(self):
         self.assertEqual(pubs.total_results, 0)

     def test_search_pubs_filling_publication_contents(self):
-        '''
-        This process checks the process of filling a publication that is derived
-        from the search publication snippets.
-        '''
+        """This checks the process of filling a publication that is derived
+        from the search publication snippets. 
+ """ query = 'Creating correct blur and its effect on accommodation' results = scholarly.search_pubs(query) - pubs = [p for p in results] + pubs = list(results) self.assertGreaterEqual(len(pubs), 1) f = scholarly.fill(pubs[0]) - self.assertTrue(f['bib']['author'] == u'Cholewiak, Steven A and Love, Gordon D and Banks, Martin S') - self.assertTrue(f['author_id'] == ['4bahYMkAAAAJ', '3xJXtlwAAAAJ', 'Smr99uEAAAAJ']) - self.assertTrue(f['bib']['journal'] == u'Journal of Vision') + self.assertTrue( + f['bib']['author'] + == 'Cholewiak, Steven A and Love, Gordon D and Banks, Martin S' + ) + self.assertTrue( + f['author_id'] == ['4bahYMkAAAAJ', '3xJXtlwAAAAJ', 'Smr99uEAAAAJ'] + ) + self.assertTrue(f['bib']['journal'] == 'Journal of Vision') self.assertTrue(f['bib']['number'] == '9') - self.assertTrue(f['bib']['pages'] == u'1--1') - self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology') - self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation') - self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817') + self.assertTrue(f['bib']['pages'] == '1--1') + self.assertTrue( + f['bib']['publisher'] + == 'The Association for Research in Vision and Ophthalmology' + ) + self.assertTrue( + f['bib']['title'] == 'Creating correct blur and its effect on accommodation' + ) + self.assertTrue( + f['pub_url'] + == 'https://jov.arvojournals.org/article.aspx?articleid=2701817' + ) self.assertTrue(f['bib']['volume'] == '18') - self.assertTrue(f['bib']['pub_year'] == u'2018') + self.assertTrue(f['bib']['pub_year'] == '2018') def test_related_articles_from_author(self): - """ - Test that we obtain related articles to an article from an author - """ - author = scholarly.search_author_id("ImhakoAAAAAJ") + """Test that we obtain related articles to an article from an author.""" + author = scholarly.search_author_id('ImhakoAAAAAJ') scholarly.fill(author, sections=['basics', 'publications']) pub = author['publications'][0] - self.assertEqual(pub['bib']['title'], 'Prospect theory: An analysis of decision under risk') - self.assertEqual(pub['bib']['citation'], 'Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013') + self.assertEqual( + pub['bib']['title'], 'Prospect theory: An analysis of decision under risk' + ) + self.assertEqual( + pub['bib']['citation'], + 'Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013', + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - self.assertEqual(pub["pub_url"], same_article["pub_url"]) - for key in {'title', 'pub_year'}: + self.assertEqual(pub['pub_url'], same_article['pub_url']) + for key in ('title', 'pub_year'): self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Advances in prospect theory: Cumulative representation of uncertainty') + self.assertEqual( + related_article['bib']['title'], + 'Advances in prospect theory: Cumulative representation of uncertainty', + ) self.assertEqual(related_article['bib']['pub_year'], '1992') self.assertGreaterEqual(related_article['num_citations'], 18673) - self.assertIn("A Tversky", related_article['bib']['author']) + self.assertIn('A Tversky', related_article['bib']['author']) def 
test_related_articles_from_publication(self): - """ - Test that we obtain related articles to an article from a search - """ - pub = scholarly.search_single_pub("Planck 2018 results-VI. Cosmological parameters") + """Test that we obtain related articles to an article from a search.""" + pub = scholarly.search_single_pub( + 'Planck 2018 results-VI. Cosmological parameters' + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in ('author_id', 'pub_url', 'num_citations'): self.assertEqual(pub[key], same_article[key]) - for key in {'title', 'pub_year'}: + for key in ('title', 'pub_year'): self.assertEqual(pub['bib'][key], same_article['bib'][key]) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Large Magellanic Cloud Cepheid standards provide ' - 'a 1% foundation for the determination of the Hubble constant and stronger evidence ' - 'for physics beyond ΛCDM') + self.assertEqual( + related_article['bib']['title'], + 'Large Magellanic Cloud Cepheid standards provide ' + 'a 1% foundation for the determination of the Hubble constant and stronger evidence ' + 'for physics beyond ΛCDM', + ) self.assertEqual(related_article['bib']['pub_year'], '2019') self.assertGreaterEqual(related_article['num_citations'], 1388) - self.assertIn("AG Riess", related_article['bib']['author']) + self.assertIn('AG Riess', related_article['bib']['author']) def test_pubs_custom_url(self): - """ - Test that we can use custom URLs for retrieving publication data - """ - query_url = ('/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' - 'as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31') + """Test that we can use custom URLs for retrieving publication data.""" + query_url = ( + '/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' + 'as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31' + ) pubs = scholarly.search_pubs_custom_url(query_url) pub = next(pubs) - self.assertEqual(pub['bib']['title'], 'Quantitation and mapping of tissue optical properties using modulated imaging') - self.assertEqual(set(pub['author_id']), {'V-ab9U4AAAAJ', '4k-k6SEAAAAJ', 'GLm-SaQAAAAJ'}) + self.assertEqual( + pub['bib']['title'], + 'Quantitation and mapping of tissue optical properties using modulated imaging', + ) + self.assertEqual( + set(pub['author_id']), {'V-ab9U4AAAAJ', '4k-k6SEAAAAJ', 'GLm-SaQAAAAJ'} + ) self.assertEqual(pub['bib']['pub_year'], '2009') self.assertGreaterEqual(pub['num_citations'], 581) def check_citedby_1k(self, pub): - """A common checking method to check - """ - original_citation_count = pub["num_citations"] + """A common checking method to check.""" + original_citation_count = pub['num_citations'] # Trigger a different code path if original_citation_count <= 1000: - pub["num_citations"] = 1001 + pub['num_citations'] = 1001 citations = scholarly.citedby(pub) citation_list = list(citations) self.assertEqual(len(citation_list), original_citation_count) return citation_list def test_citedby_1k_citations(self): - """Test that scholarly can fetch 1000+ citations from an author - """ + """Test that scholarly can fetch 1000+ citations from an author.""" author = scholarly.search_author_id('QoX9bu8AAAAJ') scholarly.fill(author, sections=['publications']) - pub = [_p for _p in 
author['publications'] if _p["author_pub_id"]=="QoX9bu8AAAAJ:L8Ckcad2t8MC"][0] + pub = next( + _p + for _p in author['publications'] + if _p['author_pub_id'] == 'QoX9bu8AAAAJ:L8Ckcad2t8MC' + ) scholarly.fill(pub) citation_list = self.check_citedby_1k(pub) - yearwise_counter = Counter([c["bib"]["pub_year"] for c in citation_list]) - for year, count in pub["cites_per_year"].items(): + yearwise_counter = Counter([c['bib']['pub_year'] for c in citation_list]) + for year, count in pub['cites_per_year'].items(): self.assertEqual(yearwise_counter.get(str(year), 0), count) def test_citedby_1k_scholar(self): - """Test that scholarly can fetch 1000+ citations from a pub search. - """ - title = "Persistent entanglement in a class of eigenstates of quantum Heisenberg spin glasses" + """Test that scholarly can fetch 1000+ citations from a pub search.""" + title = 'Persistent entanglement in a class of eigenstates of quantum Heisenberg spin glasses' pubs = scholarly.search_pubs(title) pub = next(pubs) self.check_citedby_1k(pub) def test_citedby(self): - """Test that we can search citations of a paper from author's profile. - """ + """Test that we can search citations of a paper from author's profile.""" # Retrieve the author's data, fill-in, and print search_query = scholarly.search_author('Steven A Cholewiak') author = scholarly.fill(next(search_query)) pub = scholarly.fill(author['publications'][0]) # Which papers cited that publication? - top10_citations = [citation for num, citation in enumerate(scholarly.citedby(pub)) if num<10] + top10_citations = [ + citation for num, citation in enumerate(scholarly.citedby(pub)) if num < 10 + ] self.assertEqual(len(top10_citations), 10) + if __name__ == '__main__': unittest.main()