From c019a23e93fc9e19fc7abedbde999a35202b8d94 Mon Sep 17 00:00:00 2001 From: snow-fox Date: Tue, 18 Oct 2022 12:43:34 +0100 Subject: [PATCH 1/5] add pre-commit --- .github/workflows/pre-commit.yml | 22 + .pre-commit-config.yaml | 60 +++ CHANGELOG.md | 6 +- README.md | 304 ++++++------- docs/conf.py | 36 +- scholarly/__init__.py | 3 +- scholarly/_navigator.py | 206 +++++---- scholarly/_proxy_generator.py | 285 ++++++++----- scholarly/_scholarly.py | 370 +++++++++------- scholarly/author_parser.py | 339 ++++++++------- scholarly/data_types.py | 68 ++- scholarly/publication_parser.py | 481 +++++++++++---------- setup.py | 63 +-- test_module.py | 710 +++++++++++++++++++------------ 14 files changed, 1727 insertions(+), 1226 deletions(-) create mode 100644 .github/workflows/pre-commit.yml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..26515d79 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,22 @@ +# https://pre-commit.com +# This GitHub Action assumes that the repo contains a valid .pre-commit-config.yaml file. +--- +name: pre-commit +on: + pull_request: + push: + branches: [master] + +permissions: + contents: read + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - run: pip install pre-commit + - run: pre-commit --version + - run: pre-commit install + - run: pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..0910eef8 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,60 @@ +--- +repos: + - repo: https://github.com/python/black + rev: 22.8.0 + hooks: + - id: black + - repo: https://github.com/codespell-project/codespell + rev: v2.2.1 + hooks: + - id: codespell + args: + - --skip=*.css,*.js,*.map,*.scss,*svg + - repo: https://gitlab.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: + - '--per-file-ignores=*/__init__.py:F401 test/all_parameter_combs_test.py:F405 pettingzoo/classic/go/go.py:W605' + - --extend-ignore=E203 + - --max-complexity=205 + - --max-line-length=300 + - --show-source + - --statistics + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + args: ["--profile", "black"] + - repo: https://github.com/asottile/pyupgrade + rev: v2.38.0 + hooks: + - id: pyupgrade + # TODO: remove `--keep-runtime-typing` option + args: ["--py37-plus", "--keep-runtime-typing"] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/pycqa/pydocstyle + rev: 6.1.1 + hooks: + - id: pydocstyle + args: + - --source + - --explain + - --convention=google + - --count + - --add-ignore=D100,D107 + exclude: "__init__.py$|^docs" + additional_dependencies: ["toml"] + # - repo: local + # hooks: + # - id: pyright + # name: pyright + # entry: pyright + # language: node + # pass_filenames: false + # types: [python] + # additional_dependencies: ["pyright"] diff --git a/CHANGELOG.md b/CHANGELOG.md index b5932adf..1bfbb0c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ ### Bugfixes - Fix pprint failures on Windows #413. - Thoroughly handle 1000 or more publications that are available (or not) according to public access mandates #414. -- Fix errors in `download_mandates_csv` that may occassionally occur for agencies without a policy link #413. 
+- Fix errors in `download_mandates_csv` that may occasionally occur for agencies without a policy link #413. ## Changes in v1.6.3 @@ -35,7 +35,7 @@ ### Features - Download table of funding agencies as a CSV file with URL to the funding mandates included -- Downlad top-ranking journals in general, under sub-categories and in different languages as a CSV file +- Download top-ranking journals in general, under sub-categories and in different languages as a CSV file ### Bugfixes - #392 @@ -58,7 +58,7 @@ ## Changes in v1.5.0 ### Features - Fetch the public access mandates information from a Scholar profile and mark the publications whether or not they satisfy the open-access mandate. -- Fetch an author's organization identifer from their Scholar profile +- Fetch an author's organization identifier from their Scholar profile - Search for all authors affiliated with an organization - Fetch homepage URL from a Scholar profile ### Enhancements diff --git a/README.md b/README.md index 9e13dcc8..2401d5b6 100644 --- a/README.md +++ b/README.md @@ -1,152 +1,152 @@ -[![Python package](https://github.com/scholarly-python-package/scholarly/workflows/Python%20package/badge.svg?branch=main)](https://github.com/scholarly-python-package/scholarly/actions?query=branch%3Amain) -[![codecov](https://codecov.io/gh/scholarly-python-package/scholarly/branch/main/graph/badge.svg?token=0svtI9yVSQ)](https://codecov.io/gh/scholarly-python-package/scholarly) -[![Documentation Status](https://readthedocs.org/projects/scholarly/badge/?version=latest)](https://scholarly.readthedocs.io/en/latest/?badge=latest) -[![DOI](https://zenodo.org/badge/27442991.svg)](https://zenodo.org/badge/latestdoi/27442991) - -# scholarly - -scholarly is a module that allows you to retrieve author and publication information from [Google Scholar](https://scholar.google.com) in a friendly, Pythonic way without having to solve CAPTCHAs. - -## Installation - -[![Anaconda-Server Badge](https://anaconda.org/conda-forge/scholarly/badges/installer/conda.svg)](https://conda.anaconda.org/conda-forge) -[![PyPI version](https://badge.fury.io/py/scholarly.svg)](https://badge.fury.io/py/scholarly) - -`scholarly` can be installed either with `conda` or with `pip`. -To install using `conda`, simply run -```bash -conda install -c conda-forge scholarly -``` - -Alternatively, use `pip` to install the latest release from pypi: - -```bash -pip3 install scholarly -``` - -or `pip` to install from github: - -```bash -pip3 install -U git+https://github.com/scholarly-python-package/scholarly.git -``` - -We are constantly developing new features. -Please update your local package regularly. -`scholarly` follows [Semantic Versioning](https://semver.org/). -This means your code that uses an earlier version of `scholarly` is guaranteed to work with newer versions. - -### Optional dependencies - -- **Tor**: - - `scholarly` comes with a handful of APIs to set up proxies to circumvent anti-bot measures. - Tor methods are deprecated since v1.5 and are not actively tested or supported. - If you wish to use Tor, install `scholarly` using the `tor` tag as - ```bash - pip3 install scholarly[tor] - ``` - If you use `zsh` (which is now the default in latest macOS), you should type this as - ```zsh - pip3 install scholarly'[tor]' - ``` - **Note:** Tor option is unavailable with conda installation. 
- -## Tests - -To check if your installation is succesful, run the tests by executing the `test_module.py` file as: - -```bash -python3 test_module -``` - -or - -```bash -python3 -m unittest -v test_module.py -``` -## Documentation - -Check the [documentation](https://scholarly.readthedocs.io/en/latest/?badge=latest) for a [complete API reference](https://scholarly.readthedocs.io/en/stable/scholarly.html) and a [quickstart guide](https://scholarly.readthedocs.io/en/stable/quickstart.html). - -### Examples - -```python -from scholarly import scholarly - -# Retrieve the author's data, fill-in, and print -# Get an iterator for the author results -search_query = scholarly.search_author('Steven A Cholewiak') -# Retrieve the first result from the iterator -first_author_result = next(search_query) -scholarly.pprint(first_author_result) - -# Retrieve all the details for the author -author = scholarly.fill(first_author_result ) -scholarly.pprint(author) - -# Take a closer look at the first publication -first_publication = author['publications'][0] -first_publication_filled = scholarly.fill(first_publication) -scholarly.pprint(first_publication_filled) - -# Print the titles of the author's publications -publication_titles = [pub['bib']['title'] for pub in author['publications']] -print(publication_titles) - -# Which papers cited that publication? -citations = [citation['bib']['title'] for citation in scholarly.citedby(first_publication_filled)] -print(citations) -``` - -**IMPORTANT**: Making certain types of queries, such as `scholarly.citedby` or `scholarly.search_pubs`, will lead to Google Scholar blocking your requests and may eventually block your IP address. -You must use proxy services to avoid this situation. -See the ["Using proxies" section](https://scholarly.readthedocs.io/en/stable/quickstart.html#using-proxies) in the documentation for more details. Here's a short example: - -```python -from scholarly import ProxyGenerator - -# Set up a ProxyGenerator object to use free proxies -# This needs to be done only once per session -pg = ProxyGenerator() -pg.FreeProxies() -scholarly.use_proxy(pg) - -# Now search Google Scholar from behind a proxy -search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects') -scholarly.pprint(next(search_query)) -``` - -`scholarly` also has APIs that work with several premium (paid) proxy services. -`scholarly` is smart enough to know which queries need proxies and which do not. -It is therefore recommended to always set up a proxy in the beginning of your application. - -#### Disclaimer - -The developers use `ScraperAPI` to run the tests in Github Actions. -The developers of `scholarly` are not affiliated with any of the proxy services and do not profit from them. If your favorite service is not supported, please submit an issue or even better, follow it up with a pull request. - -## Contributing - -We welcome contributions from you. -Please create an issue, fork this repository and submit a pull request. -Read the [contributing document](.github/CONTRIBUTING.md) for more information. - -## Acknowledging `scholarly` - -If you have used this codebase in a scientific publication, please cite this software as following: - -```bibtex -@software{cholewiak2021scholarly, - author = {Cholewiak, Steven A. 
and Ipeirotis, Panos and Silva, Victor and Kannawadi, Arun}, - title = {{SCHOLARLY: Simple access to Google Scholar authors and citation using Python}}, - year = {2021}, - doi = {10.5281/zenodo.5764801}, - license = {Unlicense}, - url = {https://github.com/scholarly-python-package/scholarly}, - version = {1.5.1} -} -``` - -## License - -The original code that this project was forked from was released by [Luciano Bello](https://github.com/lbello/chalmers-web) under a [WTFPL](http://www.wtfpl.net/) license. In keeping with this mentality, all code is released under the [Unlicense](http://unlicense.org/). +[![Python package](https://github.com/scholarly-python-package/scholarly/workflows/Python%20package/badge.svg?branch=main)](https://github.com/scholarly-python-package/scholarly/actions?query=branch%3Amain) +[![codecov](https://codecov.io/gh/scholarly-python-package/scholarly/branch/main/graph/badge.svg?token=0svtI9yVSQ)](https://codecov.io/gh/scholarly-python-package/scholarly) +[![Documentation Status](https://readthedocs.org/projects/scholarly/badge/?version=latest)](https://scholarly.readthedocs.io/en/latest/?badge=latest) +[![DOI](https://zenodo.org/badge/27442991.svg)](https://zenodo.org/badge/latestdoi/27442991) + +# scholarly + +scholarly is a module that allows you to retrieve author and publication information from [Google Scholar](https://scholar.google.com) in a friendly, Pythonic way without having to solve CAPTCHAs. + +## Installation + +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/scholarly/badges/installer/conda.svg)](https://conda.anaconda.org/conda-forge) +[![PyPI version](https://badge.fury.io/py/scholarly.svg)](https://badge.fury.io/py/scholarly) + +`scholarly` can be installed either with `conda` or with `pip`. +To install using `conda`, simply run +```bash +conda install -c conda-forge scholarly +``` + +Alternatively, use `pip` to install the latest release from pypi: + +```bash +pip3 install scholarly +``` + +or `pip` to install from github: + +```bash +pip3 install -U git+https://github.com/scholarly-python-package/scholarly.git +``` + +We are constantly developing new features. +Please update your local package regularly. +`scholarly` follows [Semantic Versioning](https://semver.org/). +This means your code that uses an earlier version of `scholarly` is guaranteed to work with newer versions. + +### Optional dependencies + +- **Tor**: + + `scholarly` comes with a handful of APIs to set up proxies to circumvent anti-bot measures. + Tor methods are deprecated since v1.5 and are not actively tested or supported. + If you wish to use Tor, install `scholarly` using the `tor` tag as + ```bash + pip3 install scholarly[tor] + ``` + If you use `zsh` (which is now the default in latest macOS), you should type this as + ```zsh + pip3 install scholarly'[tor]' + ``` + **Note:** Tor option is unavailable with conda installation. + +## Tests + +To check if your installation is successful, run the tests by executing the `test_module.py` file as: + +```bash +python3 test_module +``` + +or + +```bash +python3 -m unittest -v test_module.py +``` +## Documentation + +Check the [documentation](https://scholarly.readthedocs.io/en/latest/?badge=latest) for a [complete API reference](https://scholarly.readthedocs.io/en/stable/scholarly.html) and a [quickstart guide](https://scholarly.readthedocs.io/en/stable/quickstart.html). 
+ +### Examples + +```python +from scholarly import scholarly + +# Retrieve the author's data, fill-in, and print +# Get an iterator for the author results +search_query = scholarly.search_author('Steven A Cholewiak') +# Retrieve the first result from the iterator +first_author_result = next(search_query) +scholarly.pprint(first_author_result) + +# Retrieve all the details for the author +author = scholarly.fill(first_author_result ) +scholarly.pprint(author) + +# Take a closer look at the first publication +first_publication = author['publications'][0] +first_publication_filled = scholarly.fill(first_publication) +scholarly.pprint(first_publication_filled) + +# Print the titles of the author's publications +publication_titles = [pub['bib']['title'] for pub in author['publications']] +print(publication_titles) + +# Which papers cited that publication? +citations = [citation['bib']['title'] for citation in scholarly.citedby(first_publication_filled)] +print(citations) +``` + +**IMPORTANT**: Making certain types of queries, such as `scholarly.citedby` or `scholarly.search_pubs`, will lead to Google Scholar blocking your requests and may eventually block your IP address. +You must use proxy services to avoid this situation. +See the ["Using proxies" section](https://scholarly.readthedocs.io/en/stable/quickstart.html#using-proxies) in the documentation for more details. Here's a short example: + +```python +from scholarly import ProxyGenerator + +# Set up a ProxyGenerator object to use free proxies +# This needs to be done only once per session +pg = ProxyGenerator() +pg.FreeProxies() +scholarly.use_proxy(pg) + +# Now search Google Scholar from behind a proxy +search_query = scholarly.search_pubs('Perception of physical stability and center of mass of 3D objects') +scholarly.pprint(next(search_query)) +``` + +`scholarly` also has APIs that work with several premium (paid) proxy services. +`scholarly` is smart enough to know which queries need proxies and which do not. +It is therefore recommended to always set up a proxy in the beginning of your application. + +#### Disclaimer + +The developers use `ScraperAPI` to run the tests in Github Actions. +The developers of `scholarly` are not affiliated with any of the proxy services and do not profit from them. If your favorite service is not supported, please submit an issue or even better, follow it up with a pull request. + +## Contributing + +We welcome contributions from you. +Please create an issue, fork this repository and submit a pull request. +Read the [contributing document](.github/CONTRIBUTING.md) for more information. + +## Acknowledging `scholarly` + +If you have used this codebase in a scientific publication, please cite this software as following: + +```bibtex +@software{cholewiak2021scholarly, + author = {Cholewiak, Steven A. and Ipeirotis, Panos and Silva, Victor and Kannawadi, Arun}, + title = {{SCHOLARLY: Simple access to Google Scholar authors and citation using Python}}, + year = {2021}, + doi = {10.5281/zenodo.5764801}, + license = {Unlicense}, + url = {https://github.com/scholarly-python-package/scholarly}, + version = {1.5.1} +} +``` + +## License + +The original code that this project was forked from was released by [Luciano Bello](https://github.com/lbello/chalmers-web) under a [WTFPL](http://www.wtfpl.net/) license. In keeping with this mentality, all code is released under the [Unlicense](http://unlicense.org/). 
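
The README changes above are a whitespace/line-ending rewrite of the usage documentation; the style rules themselves live in the `.pre-commit-config.yaml` added at the top of this patch and are enforced in CI by `.github/workflows/pre-commit.yml`. For reference, a minimal sketch of the equivalent local workflow, assuming `pre-commit` is installed from PyPI and the commands are run from the repository root:

```bash
# One-time setup: install pre-commit and register the git hooks
pip3 install pre-commit
pre-commit install

# Run every configured hook (black, codespell, flake8, isort, pyupgrade,
# mixed-line-ending, pydocstyle) against the whole tree, mirroring the
# `pre-commit run --all-files` step in the CI workflow
pre-commit run --all-files
```

After `pre-commit install`, the same hooks also run automatically against staged files on each `git commit`.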
diff --git a/docs/conf.py b/docs/conf.py
index fd06d34d..78ed739b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,19 +12,21 @@
 # import os
 import sys
+
 import sphinx_rtd_theme
-sys.path.insert(0, os.path.abspath('..'))
+
+sys.path.insert(0, os.path.abspath(".."))
 
 # -- Project information -----------------------------------------------------
 
-project = 'scholarlyORG'
-copyright = '2021, Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi'
-author = 'Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi'
+project = "scholarlyORG"
+copyright = "2021, Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi"
+author = "Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi"
 
 # The full version, including alpha/beta/rc tags
-release = '1.0b1'
-master_doc = 'index'
+release = "1.0b1"
+master_doc = "index"
 
 
 # -- General configuration ---------------------------------------------------
@@ -32,29 +34,29 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.coverage',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.githubpages',
-    'sphinx.ext.doctest',
-    'sphinx_rtd_theme',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.coverage",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+    "sphinx.ext.doctest",
+    "sphinx_rtd_theme",
     #'sphinx.ext.napoleon'
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = 'en'
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 
 # -- Options for HTML output -------------------------------------------------
@@ -62,7 +64,7 @@
 # The theme to use for HTML and HTML Help pages. See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
@@ -75,4 +77,4 @@
 # -- Options for todo extension ----------------------------------------------
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False \ No newline at end of file +todo_include_todos = False diff --git a/scholarly/__init__.py b/scholarly/__init__.py index f339bc94..be80a644 100644 --- a/scholarly/__init__.py +++ b/scholarly/__init__.py @@ -1,4 +1,5 @@ +from ._proxy_generator import DOSException, MaxTriesExceededException, ProxyGenerator from ._scholarly import _Scholarly from .data_types import Author, Publication -from ._proxy_generator import ProxyGenerator, DOSException, MaxTriesExceededException + scholarly = _Scholarly() diff --git a/scholarly/_navigator.py b/scholarly/_navigator.py index ef7c4d21..68e8c2e9 100644 --- a/scholarly/_navigator.py +++ b/scholarly/_navigator.py @@ -1,39 +1,36 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from ._proxy_generator import ProxyGenerator, MaxTriesExceededException, DOSException - -from bs4 import BeautifulSoup - import codecs import logging import random import time + +from bs4 import BeautifulSoup from requests.exceptions import Timeout from selenium.webdriver.common.by import By -from .publication_parser import _SearchScholarIterator + +from ._proxy_generator import DOSException, MaxTriesExceededException, ProxyGenerator from .author_parser import AuthorParser -from .publication_parser import PublicationParser -from .data_types import Author, PublicationSource, ProxyMode +from .data_types import Author, ProxyMode, PublicationSource +from .publication_parser import PublicationParser, _SearchScholarIterator class Singleton(type): + """A single search instance.""" + _instances = {} def __call__(cls, *args, **kwargs): + """A single search instance.""" if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, - **kwargs) + cls._instances[cls] = super().__call__(*args, **kwargs) return cls._instances[cls] -class Navigator(object, metaclass=Singleton): +class Navigator(metaclass=Singleton): """A class used to navigate pages on google scholar.""" def __init__(self): - super(Navigator, self).__init__() - self.logger = logging.getLogger('scholarly') + super().__init__() + self.logger = logging.getLogger("scholarly") self._TIMEOUT = 5 self._max_retries = 5 # A Navigator instance has two proxy managers, each with their session. @@ -45,18 +42,17 @@ def __init__(self): self._session2 = self.pm2.get_session() self.got_403 = False - def set_logger(self, enable: bool): """Enable or disable the logger for google scholar.""" - - self.logger.setLevel((logging.INFO if enable else logging.CRITICAL)) + self.logger.setLevel(logging.INFO if enable else logging.CRITICAL) def set_timeout(self, timeout: int): - """Set timeout period in seconds for scholarly""" + """Set timeout period in seconds for scholarly.""" if timeout >= 0: self._TIMEOUT = timeout def use_proxy(self, pg1: ProxyGenerator, pg2: ProxyGenerator = None): + """Sets up proxy generators.""" if pg1 is not None: self.pm1 = pg1 @@ -66,23 +62,25 @@ def use_proxy(self, pg1: ProxyGenerator, pg2: ProxyGenerator = None): self.pm2 = ProxyGenerator() proxy_works = self.pm2.FreeProxies() if not proxy_works: - self.logger.info("FreeProxy as a secondary proxy is not working. " - "Using the primary proxy for all requests") + self.logger.info( + "FreeProxy as a secondary proxy is not working. 
" + "Using the primary proxy for all requests" + ) self.pm2 = pg1 self._session1 = self.pm1.get_session() self._session2 = self.pm2.get_session() def _new_session(self, premium=True): + """Creates a new search session.""" self.got_403 = False if premium: self._session1 = self.pm1._new_session() else: self._session2 = self.pm2._new_session() - def _get_page(self, pagerequest: str, premium: bool = False) -> str: - """Return the data from a webpage + """Return the data from a webpage. :param pagerequest: the page url :type pagerequest: str @@ -105,13 +103,13 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: premium = True if pm.proxy_mode is ProxyMode.SCRAPERAPI: self.set_timeout(60) - timeout=self._TIMEOUT + timeout = self._TIMEOUT while tries < self._max_retries: try: - w = random.uniform(1,2) + w = random.uniform(1, 2) time.sleep(w) resp = session.get(pagerequest, timeout=timeout) - self.logger.debug("Session proxy config is {}".format(session.proxies)) + self.logger.debug(f"Session proxy config is {session.proxies}") has_captcha = self._requests_has_captcha(resp.text) @@ -126,48 +124,77 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: if not pm.has_proxy(): self.logger.info("No other connections possible.") if not self.got_403: - self.logger.info("Retrying immediately with another session.") + self.logger.info( + "Retrying immediately with another session." + ) else: - if pm.proxy_mode not in (ProxyMode.LUMINATI, ProxyMode.SCRAPERAPI): - w = random.uniform(60, 2*60) - self.logger.info("Will retry after %.2f seconds (with another session).", w) + if pm.proxy_mode not in ( + ProxyMode.LUMINATI, + ProxyMode.SCRAPERAPI, + ): + w = random.uniform(60, 2 * 60) + self.logger.info( + "Will retry after %.2f seconds (with another session).", + w, + ) time.sleep(w) self._new_session(premium=premium) self.got_403 = True - continue # Retry request within same session + continue # Retry request within same session else: - self.logger.info("We can use another connection... let's try that.") + self.logger.info( + "We can use another connection... let's try that." + ) else: - self.logger.info("""Response code %d. - Retrying...""", resp.status_code) + self.logger.info( + """Response code %d. + Retrying...""", + resp.status_code, + ) except DOSException: if not pm.has_proxy(): self.logger.info("No other connections possible.") - w = random.uniform(60, 2*60) - self.logger.info("Will retry after %.2f seconds (with the same session).", w) + w = random.uniform(60, 2 * 60) + self.logger.info( + "Will retry after %.2f seconds (with the same session).", w + ) time.sleep(w) continue except Timeout as e: - err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args) + err = "Timeout Exception {} while fetching page: {}".format( + type(e).__name__, + e.args, + ) self.logger.info(err) - if timeout < 3*self._TIMEOUT: - self.logger.info("Increasing timeout and retrying within same session.") + if timeout < 3 * self._TIMEOUT: + self.logger.info( + "Increasing timeout and retrying within same session." 
+ ) timeout = timeout + self._TIMEOUT continue self.logger.info("Giving up this session.") except Exception as e: - err = "Exception %s while fetching page: %s" % (type(e).__name__, e.args) + err = "Exception {} while fetching page: {}".format( + type(e).__name__, + e.args, + ) self.logger.info(err) self.logger.info("Retrying with a new session.") tries += 1 try: - session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None)) + session, timeout = pm.get_next_proxy( + num_tries=tries, + old_timeout=timeout, + old_proxy=session.proxies.get("http", None), + ) except Exception: - self.logger.info("No other secondary connections possible. " - "Using the primary proxy for all requests.") + self.logger.info( + "No other secondary connections possible. " + "Using the primary proxy for all requests." + ) break # If secondary proxy does not work, try again primary proxy. @@ -176,13 +203,12 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str: else: raise MaxTriesExceededException("Cannot Fetch from Google Scholar.") - def _set_retries(self, num_retries: int) -> None: - if (num_retries < 0): + """Sets the number of retries allowed per search.""" + if num_retries < 0: raise ValueError("num_retries must not be negative") self._max_retries = num_retries - def _requests_has_captcha(self, text) -> bool: """Tests whether some html text contains a captcha. @@ -192,8 +218,8 @@ def _requests_has_captcha(self, text) -> bool: :rtype: {bool} """ return self._has_captcha( - lambda i : f'id="{i}"' in text, - lambda c : f'class="{c}"' in text, + lambda i: f'id="{i}"' in text, + lambda c: f'class="{c}"' in text, ) def _webdriver_has_captcha(self, premium=True) -> bool: @@ -204,15 +230,15 @@ def _webdriver_has_captcha(self, premium=True) -> bool: """ pm = self.pm1 if premium else self.pm2 return self._has_captcha( - lambda i : len(pm._get_webdriver().find_elements(By.ID, i)) > 0, - lambda c : len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, + lambda i: len(pm._get_webdriver().find_elements(By.ID, i)) > 0, + lambda c: len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, ) def _has_captcha(self, got_id, got_class) -> bool: _CAPTCHA_IDS = [ - "gs_captcha_ccl", # the normal captcha div - "recaptcha", # the form used on full-page captchas - "captcha-form", # another form used on full-page captchas + "gs_captcha_ccl", # the normal captcha div + "recaptcha", # the form used on full-page captchas + "captcha-form", # another form used on full-page captchas ] _DOS_CLASSES = [ "rc-doscaptcha-body", @@ -222,41 +248,40 @@ def _has_captcha(self, got_id, got_class) -> bool: return any([got_id(i) for i in _CAPTCHA_IDS]) def _get_soup(self, url: str) -> BeautifulSoup: - """Return the BeautifulSoup for a page on scholar.google.com""" - html = self._get_page('https://scholar.google.com{0}'.format(url)) - html = html.replace(u'\xa0', u' ') - res = BeautifulSoup(html, 'html.parser') + """Return the BeautifulSoup for a page on scholar.google.com .""" + html = self._get_page(f"https://scholar.google.com{url}") + html = html.replace("\xa0", " ") + res = BeautifulSoup(html, "html.parser") try: - self.publib = res.find('div', id='gs_res_glb').get('data-sva') + self.publib = res.find("div", id="gs_res_glb").get("data-sva") except Exception: pass return res - def search_authors(self, url: str)->Author: - """Generator that returns Author objects from the author search page""" + def search_authors(self, url: str) -> Author: + """Generator that 
returns Author objects from the author search page.""" soup = self._get_soup(url) author_parser = AuthorParser(self) while True: - rows = soup.find_all('div', 'gsc_1usr') + rows = soup.find_all("div", "gsc_1usr") self.logger.info("Found %d authors", len(rows)) for row in rows: yield author_parser.get_author(row) - cls1 = 'gs_btnPR gs_in_ib gs_btn_half ' - cls2 = 'gs_btn_lsb gs_btn_srt gsc_pgn_pnx' - next_button = soup.find(class_=cls1+cls2) # Can be improved - if next_button and 'disabled' not in next_button.attrs: + cls1 = "gs_btnPR gs_in_ib gs_btn_half " + cls2 = "gs_btn_lsb gs_btn_srt gsc_pgn_pnx" + next_button = soup.find(class_=cls1 + cls2) # Can be improved + if next_button and "disabled" not in next_button.attrs: self.logger.info("Loading next page of authors") - url = next_button['onclick'][17:-1] + url = next_button["onclick"][17:-1] url = codecs.getdecoder("unicode_escape")(url)[0] soup = self._get_soup(url) else: self.logger.info("No more author pages") break - def search_publication(self, url: str, - filled: bool = False) -> PublicationParser: - """Search by scholar query and return a single Publication object + def search_publication(self, url: str, filled: bool = False) -> PublicationParser: + """Search by scholar query and return a single Publication object. :param url: the url to be searched at :type url: str @@ -267,13 +292,16 @@ def search_publication(self, url: str, """ soup = self._get_soup(url) publication_parser = PublicationParser(self) - pub = publication_parser.get_publication(soup.find_all('div', 'gs_or')[0], PublicationSource.PUBLICATION_SEARCH_SNIPPET) + pub = publication_parser.get_publication( + soup.find_all("div", "gs_or")[0], + PublicationSource.PUBLICATION_SEARCH_SNIPPET, + ) if filled: pub = publication_parser.fill(pub) return pub def search_publications(self, url: str) -> _SearchScholarIterator: - """Returns a Publication Generator given a url + """Returns a Publication Generator given a url. :param url: the url where publications can be found. :type url: str @@ -282,8 +310,15 @@ def search_publications(self, url: str) -> _SearchScholarIterator: """ return _SearchScholarIterator(self, url) - def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0) -> Author: - """Search by author ID and return a Author object + def search_author_id( + self, + id: str, + filled: bool = False, + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author: + """Search by author ID and return a Author object. + :param id: the Google Scholar id of a particular author :type url: str :param filled: If the returned Author object should be filled @@ -298,30 +333,39 @@ def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby author_parser = AuthorParser(self) res = author_parser.get_author(id) if filled: - res = author_parser.fill(res, sortby=sortby, publication_limit=publication_limit) + res = author_parser.fill( + res, sortby=sortby, publication_limit=publication_limit + ) else: - res = author_parser.fill(res, sections=['basics'], sortby=sortby, publication_limit=publication_limit) + res = author_parser.fill( + res, + sections=["basics"], + sortby=sortby, + publication_limit=publication_limit, + ) return res def search_organization(self, url: str, fromauthor: bool) -> list: """Generate instiution object from author search page. - if no results are found and `fromuthor` is True, then use the first author from the search - to get institution/organization name. 
+ + If no results are found and `fromuthor` is True, then use the first author from the search to get institution/organization name. """ soup = self._get_soup(url) - rows = soup.find_all('h3', 'gsc_inst_res') + rows = soup.find_all("h3", "gsc_inst_res") if rows: self.logger.info("Found institution") res = [] for row in rows: - res.append({'Organization': row.a.text, 'id': row.a['href'].split('org=', 1)[1]}) + res.append( + {"Organization": row.a.text, "id": row.a["href"].split("org=", 1)[1]} + ) if rows == [] and fromauthor is True: try: auth = next(self.search_authors(url)) authorg = self.search_author_id(auth.id).organization - authorg['fromauthor'] = True + authorg["fromauthor"] = True res.append(authorg) except Exception: res = [] diff --git a/scholarly/_proxy_generator.py b/scholarly/_proxy_generator.py index 43cc78df..993253a6 100644 --- a/scholarly/_proxy_generator.py +++ b/scholarly/_proxy_generator.py @@ -1,21 +1,25 @@ -from typing import Callable -from fp.fp import FreeProxy -import random import logging +import random +import tempfile import time +from contextlib import contextmanager +from typing import Callable +from urllib.parse import urlparse + import requests -import tempfile import urllib3 - +from deprecated import deprecated +from fake_useragent import UserAgent +from fp.fp import FreeProxy from selenium import webdriver -from selenium.webdriver.support.wait import WebDriverWait, TimeoutException +from selenium.common.exceptions import ( + UnexpectedAlertPresentException, + WebDriverException, +) from selenium.webdriver.common.by import By -from selenium.common.exceptions import WebDriverException, UnexpectedAlertPresentException from selenium.webdriver.firefox.options import Options as FirefoxOptions -from urllib.parse import urlparse -from fake_useragent import UserAgent -from contextlib import contextmanager -from deprecated import deprecated +from selenium.webdriver.support.wait import TimeoutException, WebDriverWait + try: import stem.process from stem import Signal @@ -31,13 +35,15 @@ class DOSException(Exception): class MaxTriesExceededException(Exception): - """Maximum number of tries by scholarly reached""" + """Maximum number of tries by scholarly reached.""" -class ProxyGenerator(object): +class ProxyGenerator: + """Sets up a proxy to be used.""" + def __init__(self): # setting up logger - self.logger = logging.getLogger('scholarly') + self.logger = logging.getLogger("scholarly") self._proxy_gen = None # If we use a proxy or Tor, we set this to True @@ -54,16 +60,18 @@ def __init__(self): self._new_session() def __del__(self): + """Deletes this proxy session.""" if self._tor_process: self._tor_process.kill() self._tor_process.wait() self._close_session() def get_session(self): + """Returns the proxy session.""" return self._session def Luminati(self, usr, passwd, proxy_port): - """ Setups a luminati proxy without refreshing capabilities. + """Setups a luminati proxy without refreshing capabilities. :param usr: scholarly username, optional by default None :type usr: string @@ -78,12 +86,14 @@ def Luminati(self, usr, passwd, proxy_port): >>> pg = ProxyGenerator() >>> success = pg.Luminati(usr = foo, passwd = bar, port = 1200) """ - if (usr is not None and passwd is not None and proxy_port is not None): + if usr is not None and passwd is not None and proxy_port is not None: username = usr password = passwd port = proxy_port else: - self.logger.warning("Not enough parameters were provided for the Luminati proxy. 
Reverting to a local connection.") + self.logger.warning( + "Not enough parameters were provided for the Luminati proxy. Reverting to a local connection." + ) return session_id = random.random() proxy = f"http://{username}-session-{session_id}:{password}@zproxy.lum-superproxy.io:{port}" @@ -96,12 +106,11 @@ def Luminati(self, usr, passwd, proxy_port): return proxy_works def SingleProxy(self, http=None, https=None): - """ - Use proxy of your choice + """Use proxy of your choice. :param http: http proxy address :type http: string - :param https: https proxy adress + :param https: https proxy address :type https: string :returns: whether or not the proxy was set up successfully :rtype: {bool} @@ -109,7 +118,7 @@ def SingleProxy(self, http=None, https=None): :Example:: >>> pg = ProxyGenerator() - >>> success = pg.SingleProxy(http = , https = ) + >>> success = pg.SingleProxy(http = , https = ) """ self.logger.info("Enabling proxies: http=%s https=%s", http, https) proxy_works = self._use_proxy(http=http, https=https) @@ -117,11 +126,16 @@ def SingleProxy(self, http=None, https=None): self.proxy_mode = ProxyMode.SINGLEPROXY self.logger.info("Proxy setup successfully") else: - self.logger.warning("Unable to setup the proxy: http=%s https=%s. Reason unknown." , http, https) + self.logger.warning( + "Unable to setup the proxy: http=%s https=%s. Reason unknown.", + http, + https, + ) return proxy_works def _check_proxy(self, proxies) -> bool: """Checks if a proxy is working. + :param proxies: A dictionary {'http': url1, 'https': url1} with the urls of the proxies :returns: whether the proxy is working or not @@ -132,8 +146,9 @@ def _check_proxy(self, proxies) -> bool: try: resp = session.get("http://httpbin.org/ip", timeout=self._TIMEOUT) if resp.status_code == 200: - self.logger.info("Proxy works! IP address: %s", - resp.json()["origin"]) + self.logger.info( + "Proxy works! IP address: %s", resp.json()["origin"] + ) return True elif resp.status_code == 401: self.logger.warning("Incorrect credentials for proxy!") @@ -143,17 +158,23 @@ def _check_proxy(self, proxies) -> bool: except Exception as e: # Failure is common and expected with free proxy. # Do not log at warning level and annoy users. - level = logging.DEBUG if self.proxy_mode is ProxyMode.FREE_PROXIES else logging.WARNING + level = ( + logging.DEBUG + if self.proxy_mode is ProxyMode.FREE_PROXIES + else logging.WARNING + ) self.logger.log(level, "Exception while testing proxy: %s", e) if self.proxy_mode in (ProxyMode.LUMINATI, ProxyMode.SCRAPERAPI): - self.logger.warning("Double check your credentials and try increasing the timeout") + self.logger.warning( + "Double check your credentials and try increasing the timeout" + ) return False def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool: """Refreshes the id by using a new Tor node. - :returns: Whether or not the refresh was succesful + :returns: Whether or not the refresh was successful :rtype: {bool} """ try: @@ -171,8 +192,7 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool: return (False, None) def _use_proxy(self, http: str, https: str = None) -> bool: - """Allows user to set their own proxy for the connection session. - Sets the proxy if it works. + """Allows user to set their own proxy for the connection session. Sets the proxy if it works. 
:param http: the http proxy :type http: str @@ -184,16 +204,21 @@ def _use_proxy(self, http: str, https: str = None) -> bool: if https is None: https = http - proxies = {'http': http, 'https': https} + proxies = {"http": http, "https": https} if self.proxy_mode == ProxyMode.SCRAPERAPI: - r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json() + r = requests.get( + "http://api.scraperapi.com/account", params={"api_key": self._API_KEY} + ).json() if "error" in r: self.logger.warning(r["error"]) self._proxy_works = False else: self._proxy_works = r["requestCount"] < int(r["requestLimit"]) - self.logger.info("Successful ScraperAPI requests %d / %d", - r["requestCount"], r["requestLimit"]) + self.logger.info( + "Successful ScraperAPI requests %d / %d", + r["requestCount"], + r["requestLimit"], + ) else: self._proxy_works = self._check_proxy(proxies) @@ -203,10 +228,13 @@ def _use_proxy(self, http: str, https: str = None) -> bool: return self._proxy_works - @deprecated(version='1.5', reason="Tor methods are deprecated and are not actively tested.") - def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: str): - """ - Setting up Tor Proxy. A tor service should be already running on the system. Otherwise you might want to use Tor_Internal + @deprecated( + version="1.5", reason="Tor methods are deprecated and are not actively tested." + ) + def Tor_External( + self, tor_sock_port: int, tor_control_port: int, tor_password: str + ): + """Setting up Tor Proxy. A tor service should be already running on the system. Otherwise you might want to use Tor_Internal. :param tor_sock_port: the port where the Tor sock proxy is running :type tor_sock_port: int @@ -222,8 +250,10 @@ def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: Note: This method is deprecated since v1.5 """ if stem is None: - raise RuntimeError("Tor methods are not supported with basic version of the package. " - "Please install scholarly[tor] to use this method.") + raise RuntimeError( + "Tor methods are not supported with basic version of the package. " + "Please install scholarly[tor] to use this method." + ) self._TIMEOUT = 10 @@ -245,14 +275,16 @@ def Tor_External(self, tor_sock_port: int, tor_control_port: int, tor_password: "proxy_works": self._proxy_works, "refresh_works": self._can_refresh_tor, "tor_control_port": tor_control_port, - "tor_sock_port": tor_sock_port + "tor_sock_port": tor_sock_port, } - @deprecated(version='1.5', reason="Tor methods are deprecated and are not actively tested") + @deprecated( + version="1.5", reason="Tor methods are deprecated and are not actively tested" + ) def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): - ''' - Starts a Tor client running in a scholarly-specific port, together with a scholarly-specific control port. - If no arguments are passed for the tor_sock_port and the tor_control_port they are automatically generated in the following ranges + """Starts a Tor client running in a scholarly-specific port, together with a scholarly-specific control port. 
+ + If no arguments are passed for the tor_sock_port and the tor_control_port they are automatically generated in the following ranges: - tor_sock_port: (9000, 9500) - tor_control_port: (9500, 9999) @@ -268,20 +300,24 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): pg.Tor_Internal(tor_cmd = 'tor') Note: This method is deprecated since v1.5 - ''' + """ if stem is None: - raise RuntimeError("Tor methods are not supported with basic version of the package. " - "Please install scholarly[tor] to use this method.") + raise RuntimeError( + "Tor methods are not supported with basic version of the package. " + "Please install scholarly[tor] to use this method." + ) self.logger.info("Attempting to start owned Tor as the proxy") if tor_cmd is None: - self.logger.info("No tor_cmd argument passed. This should point to the location of Tor executable.") + self.logger.info( + "No tor_cmd argument passed. This should point to the location of Tor executable." + ) return { "proxy_works": False, "refresh_works": False, "tor_control_port": None, - "tor_sock_port": None + "tor_sock_port": None, } if tor_sock_port is None: @@ -298,9 +334,9 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): self._tor_process = stem.process.launch_tor_with_config( tor_cmd=tor_cmd, config={ - 'ControlPort': str(tor_control_port), - 'SocksPort': str(tor_sock_port), - 'DataDirectory': tempfile.mkdtemp() + "ControlPort": str(tor_control_port), + "SocksPort": str(tor_sock_port), + "DataDirectory": tempfile.mkdtemp() # TODO Perhaps we want to also set a password here }, # take_ownership=True # Taking this out for now, as it seems to cause trouble @@ -310,9 +346,9 @@ def Tor_Internal(self, tor_cmd=None, tor_sock_port=None, tor_control_port=None): def _has_captcha(self, got_id, got_class) -> bool: _CAPTCHA_IDS = [ - "gs_captcha_ccl", # the normal captcha div - "recaptcha", # the form used on full-page captchas - "captcha-form", # another form used on full-page captchas + "gs_captcha_ccl", # the normal captcha div + "recaptcha", # the form used on full-page captchas + "captcha-form", # another form used on full-page captchas ] _DOS_CLASSES = [ "rc-doscaptcha-body", @@ -328,8 +364,8 @@ def _webdriver_has_captcha(self) -> bool: :rtype: {bool} """ return self._has_captcha( - lambda i : len(self._get_webdriver().find_elements(By.ID, i)) > 0, - lambda c : len(self._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, + lambda i: len(self._get_webdriver().find_elements(By.ID, i)) > 0, + lambda c: len(self._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0, ) def _get_webdriver(self): @@ -352,32 +388,36 @@ def _get_webdriver(self): def _get_chrome_webdriver(self): if self._proxy_works: - webdriver.DesiredCapabilities.CHROME['proxy'] = { - "httpProxy": self._session.proxies['http'], - "sslProxy": self._session.proxies['https'], - "proxyType": "MANUAL" + webdriver.DesiredCapabilities.CHROME["proxy"] = { + "httpProxy": self._session.proxies["http"], + "sslProxy": self._session.proxies["https"], + "proxyType": "MANUAL", } options = webdriver.ChromeOptions() - options.add_argument('--headless') - self._webdriver = webdriver.Chrome('chromedriver', options=options) - self._webdriver.get("https://scholar.google.com") # Need to pre-load to set cookies later + options.add_argument("--headless") + self._webdriver = webdriver.Chrome("chromedriver", options=options) + self._webdriver.get( + "https://scholar.google.com" + ) # Need to pre-load to set cookies later return self._webdriver 
def _get_firefox_webdriver(self): if self._proxy_works: # Redirect webdriver through proxy - webdriver.DesiredCapabilities.FIREFOX['proxy'] = { - "httpProxy": self._session.proxies['http'], - "sslProxy": self._session.proxies['https'], + webdriver.DesiredCapabilities.FIREFOX["proxy"] = { + "httpProxy": self._session.proxies["http"], + "sslProxy": self._session.proxies["https"], "proxyType": "MANUAL", } options = FirefoxOptions() - options.add_argument('--headless') + options.add_argument("--headless") self._webdriver = webdriver.Firefox(options=options) - self._webdriver.get("https://scholar.google.com") # Need to pre-load to set cookies later + self._webdriver.get( + "https://scholar.google.com" + ) # Need to pre-load to set cookies later # It might make sense to (pre)set cookies as well, e.g., to set a GSP ID. # However, a limitation of webdriver makes it impossible to set cookies for @@ -392,42 +432,54 @@ def _handle_captcha2(self, url): cur_host = urlparse(self._get_webdriver().current_url).hostname for cookie in self._session.cookies: # Only set cookies matching the current domain, cf. https://github.com/w3c/webdriver/issues/1238 - if cur_host is cookie.domain.lstrip('.'): - self._get_webdriver().add_cookie({ - 'name': cookie.name, - 'value': cookie.value, - 'path': cookie.path, - 'domain':cookie.domain, - }) + if cur_host is cookie.domain.lstrip("."): + self._get_webdriver().add_cookie( + { + "name": cookie.name, + "value": cookie.value, + "path": cookie.path, + "domain": cookie.domain, + } + ) self._get_webdriver().get(url) log_interval = 10 cur = 0 - timeout = 60*60*24*7 # 1 week + timeout = 60 * 60 * 24 * 7 # 1 week while cur < timeout: try: - cur = cur + log_interval # Update before exceptions can happen - WebDriverWait(self._get_webdriver(), log_interval).until_not(lambda drv : self._webdriver_has_captcha()) + cur = cur + log_interval # Update before exceptions can happen + WebDriverWait(self._get_webdriver(), log_interval).until_not( + lambda drv: self._webdriver_has_captcha() + ) break except TimeoutException: - self.logger.info(f"Solving the captcha took already {cur} seconds (of maximum {timeout} s).") + self.logger.info( + f"Solving the captcha took already {cur} seconds (of maximum {timeout} s)." + ) except UnexpectedAlertPresentException as e: # This can apparently happen when reCAPTCHA has hiccups: # "Cannot contact reCAPTCHA. Check your connection and try again." - self.logger.info(f"Unexpected alert while waiting for captcha completion: {e.args}") + self.logger.info( + f"Unexpected alert while waiting for captcha completion: {e.args}" + ) time.sleep(15) except DOSException as e: self.logger.info("Google thinks we are DOSing the captcha.") raise e except (WebDriverException) as e: - self.logger.info("Browser seems to be disfunctional - closed by user?") + self.logger.info("Browser seems to be dysfunctional - closed by user?") raise e except Exception as e: # TODO: This exception handler should eventually be removed when # we know the "typical" (non-error) exceptions that can occur. - self.logger.info(f"Unhandled {type(e).__name__} while waiting for captcha completion: {e.args}") + self.logger.info( + f"Unhandled {type(e).__name__} while waiting for captcha completion: {e.args}" + ) else: - raise TimeoutException(f"Could not solve captcha in time (within {timeout} s).") + raise TimeoutException( + f"Could not solve captcha in time (within {timeout} s)." 
+ ) self.logger.info(f"Solved captcha in less than {cur} seconds.") for cookie in self._get_webdriver().get_cookies(): @@ -446,11 +498,11 @@ def _new_session(self): self.got_403 = False # Suppress the misleading traceback from UserAgent() - with self._suppress_logger('fake_useragent'): + with self._suppress_logger("fake_useragent"): _HEADERS = { - 'accept-language': 'en-US,en', - 'accept': 'text/html,application/xhtml+xml,application/xml', - 'User-Agent': UserAgent().random, + "accept-language": "en-US,en", + "accept": "text/html,application/xhtml+xml,application/xml", + "User-Agent": UserAgent().random, } self._session.headers.update(_HEADERS) @@ -475,35 +527,34 @@ def _close_session(self): self.logger.warning("Could not close webdriver cleanly: %s", e) def _fp_coroutine(self, timeout=1, wait_time=120): - """A coroutine to continuosly yield free proxies + """A coroutine to continuously yield free proxies. It takes back the proxies that stopped working and marks it as dirty. """ freeproxy = FreeProxy(rand=False, timeout=timeout) - if not hasattr(self, '_dirty_freeproxies'): + if not hasattr(self, "_dirty_freeproxies"): self._dirty_freeproxies = set() all_proxies = freeproxy.get_proxy_list() all_proxies.reverse() # Try the older proxies first t1 = time.time() - while (time.time()-t1 < wait_time): + while time.time() - t1 < wait_time: proxy = all_proxies.pop() if not all_proxies: all_proxies = freeproxy.get_proxy_list() if proxy in self._dirty_freeproxies: continue - proxies = {'http': proxy, 'https': proxy} + proxies = {"http": proxy, "https": proxy} proxy_works = self._check_proxy(proxies) if proxy_works: - dirty_proxy = (yield proxy) + dirty_proxy = yield proxy t1 = time.time() else: dirty_proxy = proxy self._dirty_freeproxies.add(dirty_proxy) def FreeProxies(self, timeout=1, wait_time=120): - """ - Sets up continuously rotating proxies from the free-proxy library + """Sets up continuously rotating proxies from the free-proxy library. :param timeout: Timeout for a single proxy in seconds, optional :type timeout: float @@ -537,16 +588,16 @@ def FreeProxies(self, timeout=1, wait_time=120): if n_tries == n_retries: n_dirty = len(self._dirty_freeproxies) self._fp_gen.close() - msg = ("None of the free proxies are working at the moment. " - f"Marked {n_dirty} proxies dirty. Try again after a few minutes." - ) + msg = ( + "None of the free proxies are working at the moment. " + f"Marked {n_dirty} proxies dirty. Try again after a few minutes." + ) raise MaxTriesExceededException(msg) else: return True def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): - """ - Sets up a proxy using ScraperAPI + """Sets up a proxy using ScraperAPI. The optional parameters are only for Business and Enterprise plans with ScraperAPI. For more details, https://www.scraperapi.com/documentation/ @@ -567,7 +618,9 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): raise ValueError("ScraperAPI API Key is required.") # Get basic account information. This will NOT be counted towards successful API requests. 
- r = requests.get("http://api.scraperapi.com/account", params={'api_key': API_KEY}).json() + r = requests.get( + "http://api.scraperapi.com/account", params={"api_key": API_KEY} + ).json() if "error" in r: self.logger.warning(r["error"]) return False @@ -576,8 +629,11 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): self.proxy_mode = ProxyMode.SCRAPERAPI r["requestLimit"] = int(r["requestLimit"]) - self.logger.info("Successful ScraperAPI requests %d / %d", - r["requestCount"], r["requestLimit"]) + self.logger.info( + "Successful ScraperAPI requests %d / %d", + r["requestCount"], + r["requestLimit"], + ) # ScraperAPI documentation recommends setting the timeout to 60 seconds # so it has had a chance to try out all the retries. @@ -596,13 +652,15 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) for _ in range(3): - proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001') + proxy_works = self._use_proxy( + http=f"{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001" + ) if proxy_works: self.logger.info("ScraperAPI proxy setup successfully") self._session.verify = False return proxy_works - if (r["requestCount"] >= r["requestLimit"]): + if r["requestCount"] >= r["requestLimit"]: self.logger.warning("ScraperAPI account limit reached.") else: self.logger.warning("ScraperAPI does not seem to work. Reason unknown.") @@ -610,28 +668,30 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False): return False def has_proxy(self) -> bool: + """Checks if a proxy is available.""" return self._proxy_gen or self._can_refresh_tor def _set_proxy_generator(self, gen: Callable[..., str]) -> bool: self._proxy_gen = gen return True - def get_next_proxy(self, num_tries = None, old_timeout = 3, old_proxy=None): + def get_next_proxy(self, num_tries=None, old_timeout=3, old_proxy=None): + """Tries getting the next proxy.""" new_timeout = old_timeout if self._can_refresh_tor: # Check if Tor is running and refresh it self.logger.info("Refreshing Tor ID...") self._refresh_tor_id(self._tor_control_port, self._tor_password) - time.sleep(5) # wait for the refresh to happen - new_timeout = self._TIMEOUT # Reset timeout to default + time.sleep(5) # wait for the refresh to happen + new_timeout = self._TIMEOUT # Reset timeout to default elif self._proxy_gen: - if (num_tries): + if num_tries: self.logger.info("Try #%d failed. Switching proxy.", num_tries) # Try to get another proxy new_proxy = self._proxy_gen(old_proxy) - while (not self._use_proxy(new_proxy)): + while not self._use_proxy(new_proxy): new_proxy = self._proxy_gen(new_proxy) - new_timeout = self._TIMEOUT # Reset timeout to default + new_timeout = self._TIMEOUT # Reset timeout to default self._new_session() else: self._new_session() @@ -643,8 +703,7 @@ def get_next_proxy(self, num_tries = None, old_timeout = 3, old_proxy=None): @staticmethod @contextmanager def _suppress_logger(loggerName: str, level=logging.CRITICAL): - """Temporarily suppress logging output from a specific logger. 
- """ + """Temporarily suppress logging output from a specific logger.""" logger = logging.getLogger(loggerName) original_level = logger.getEffectiveLevel() logger.setLevel(level) diff --git a/scholarly/_scholarly.py b/scholarly/_scholarly.py index faf6ab0a..310bf5fc 100644 --- a/scholarly/_scholarly.py +++ b/scholarly/_scholarly.py @@ -1,28 +1,31 @@ -"""scholarly.py""" -import requests -import os import copy import csv +import os import pprint from typing import Dict, List + +import requests +from dotenv import find_dotenv, load_dotenv + from ._navigator import Navigator from ._proxy_generator import ProxyGenerator -from dotenv import find_dotenv, load_dotenv from .author_parser import AuthorParser -from .publication_parser import PublicationParser, _SearchScholarIterator from .data_types import Author, AuthorSource, Journal, Publication, PublicationSource +from .publication_parser import PublicationParser, _SearchScholarIterator -_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}' -_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}' -_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}' -_PUBSEARCH = '/scholar?hl=en&q={0}' -_CITEDBYSEARCH = '/scholar?hl=en&cites={0}' +_AUTHSEARCH = "/citations?hl=en&view_op=search_authors&mauthors={0}" +_KEYWORDSEARCH = "/citations?hl=en&view_op=search_authors&mauthors=label:{0}" +_KEYWORDSEARCHBASE = "/citations?hl=en&view_op=search_authors&mauthors={}" +_PUBSEARCH = "/scholar?hl=en&q={0}" +_CITEDBYSEARCH = "/scholar?hl=en&cites={0}" _ORGSEARCH = "/citations?view_op=view_org&hl=en&org={0}" -_MANDATES_URL = "https://scholar.google.com/citations?view_op=mandates_leaderboard_csv&hl=en" +_MANDATES_URL = ( + "https://scholar.google.com/citations?view_op=mandates_leaderboard_csv&hl=en" +) class _Scholarly: - """Class that manages the API for scholarly""" + """Class that manages the API for scholarly.""" def __init__(self): load_dotenv(find_dotenv()) @@ -37,17 +40,19 @@ def journal_categories(self): self._journal_categories = self.get_journal_categories() return self._journal_categories - def set_retries(self, num_retries: int)->None: - """Sets the number of retries in case of errors + def set_retries(self, num_retries: int) -> None: + """Sets the number of retries in case of errors. :param num_retries: the number of retries :type num_retries: int """ - return self.__nav._set_retries(num_retries) - def use_proxy(self, proxy_generator: ProxyGenerator, - secondary_proxy_generator: ProxyGenerator = None) -> None: + def use_proxy( + self, + proxy_generator: ProxyGenerator, + secondary_proxy_generator: ProxyGenerator = None, + ) -> None: """Select which proxy method to use. See the available ProxyGenerator methods. @@ -73,24 +78,26 @@ def use_proxy(self, proxy_generator: ProxyGenerator, """ self.__nav.use_proxy(proxy_generator, secondary_proxy_generator) - def set_logger(self, enable: bool): - """Enable or disable the logger for google scholar. - Enabled by default - """ + """Enable or disable the logger for google scholar. 
Enabled by default.""" self.__nav.set_logger(enable) def set_timeout(self, timeout: int): - """Set timeout period in seconds for scholarly""" + """Set timeout period in seconds for scholarly.""" self.__nav.set_timeout(timeout) - def search_pubs(self, - query: str, patents: bool = True, - citations: bool = True, year_low: int = None, - year_high: int = None, sort_by: str = "relevance", - include_last_year: str = "abstracts", - start_index: int = 0)->_SearchScholarIterator: - """Searches by query and returns a generator of Publication objects + def search_pubs( + self, + query: str, + patents: bool = True, + citations: bool = True, + year_low: int = None, + year_high: int = None, + sort_by: str = "relevance", + include_last_year: str = "abstracts", + start_index: int = 0, + ) -> _SearchScholarIterator: + """Searches by query and returns a generator of Publication objects. :param query: terms to be searched :type query: str @@ -150,9 +157,16 @@ def search_pubs(self, 'url_scholarbib': '/scholar?q=info:K8ZpoI6hZNoJ:scholar.google.com/&output=cite&scirp=0&hl=en'} """ - url = self._construct_url(_PUBSEARCH.format(requests.utils.quote(query)), patents=patents, - citations=citations, year_low=year_low, year_high=year_high, - sort_by=sort_by, include_last_year=include_last_year, start_index=start_index) + url = self._construct_url( + _PUBSEARCH.format(requests.utils.quote(query)), + patents=patents, + citations=citations, + year_low=year_low, + year_high=year_high, + sort_by=sort_by, + include_last_year=include_last_year, + start_index=start_index, + ) return self.__nav.search_publications(url) def search_citedby(self, publication_id: int, **kwargs): @@ -166,8 +180,10 @@ def search_citedby(self, publication_id: int, **kwargs): url = self._construct_url(_CITEDBYSEARCH.format(str(publication_id)), **kwargs) return self.__nav.search_publications(url) - def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationParser: - """Search by scholar query and return a single Publication container object + def search_single_pub( + self, pub_title: str, filled: bool = False + ) -> PublicationParser: + """Search by scholar query and return a single Publication container object. :param pub_title: Title of the publication to search :type pub_title: string @@ -178,7 +194,7 @@ def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationPa return self.__nav.search_publication(url, filled) def search_author(self, name: str): - """Search by author name and return a generator of Author objects + """Search by author name and return a generator of Author objects. :Example:: @@ -204,8 +220,15 @@ def search_author(self, name: str): url = _AUTHSEARCH.format(requests.utils.quote(name)) return self.__nav.search_authors(url) - def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_limit: int = 0) -> Author or Publication: + def fill( + self, + object: dict, + sections=[], + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author or Publication: """Fills the object according to its type. + If the container type is Author it will fill the additional author fields If it is Publication it will fill it accordingly. @@ -223,47 +246,51 @@ def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_l If 'public_access' is filled along with 'publications' or afterwards for the first time, the publication entries are also marked whether they satisfy public access mandates or not. 
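        A minimal example of a partial fill (the author query is illustrative;
        the section names are those accepted by this method):

        :Example::

        .. testcode::

            search_query = scholarly.search_author('Marty Banks, Berkeley')
            author = scholarly.fill(next(search_query), sections=['basics', 'indices'])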
""" - - if object['container_type'] == "Author": + if object["container_type"] == "Author": author_parser = AuthorParser(self.__nav) object = author_parser.fill(object, sections, sortby, publication_limit) if object is False: raise ValueError("Incorrect input") - elif object['container_type'] == "Publication": + elif object["container_type"] == "Publication": publication_parser = PublicationParser(self.__nav) object = publication_parser.fill(object) return object - def bibtex(self, object: Publication)->str: - """Returns a bibtex entry for a publication that has either Scholar source - or citation source + def bibtex(self, object: Publication) -> str: + """Returns a bibtex entry for a publication that has either Scholar source or citation source. :param object: The Publication object for the bibtex exportation :type object: Publication """ - if object['container_type'] == "Publication": + if object["container_type"] == "Publication": publication_parser = PublicationParser(self.__nav) return publication_parser.bibtex(object) else: self.logger.warning("Object not supported for bibtex exportation") return - def citedby(self, object: Publication)->_SearchScholarIterator: - """Searches Google Scholar for other articles that cite this Publication - and returns a Publication generator. + def citedby(self, object: Publication) -> _SearchScholarIterator: + """Searches Google Scholar for other articles that cite this Publication and returns a Publication generator. :param object: The Publication object for the bibtex exportation :type object: Publication """ - if object['container_type'] == "Publication": + if object["container_type"] == "Publication": publication_parser = PublicationParser(self.__nav) return publication_parser.citedby(object) else: self.logger.warning("Object not supported for bibtex exportation") return - def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author: - """Search by author id and return a single Author object + def search_author_id( + self, + id: str, + filled: bool = False, + sortby: str = "citedby", + publication_limit: int = 0, + ) -> Author: + """Search by author id and return a single Author object. + :param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'. :type sortby: string :param publication_limit: if the object is an author, select the max number of publications you want you want to fill for the author. Defaults to no limit. @@ -292,7 +319,7 @@ def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby return self.__nav.search_author_id(id, filled, sortby, publication_limit) def search_keyword(self, keyword: str): - """Search by keyword and return a generator of Author objects + """Search by keyword and return a generator of Author objects. :param keyword: keyword to be searched :type keyword: str @@ -325,7 +352,7 @@ def search_keyword(self, keyword: str): return self.__nav.search_authors(url) def search_keywords(self, keywords: List[str]): - """Search by keywords and return a generator of Author objects + """Search by keywords and return a generator of Author objects. 
        :param keywords: a list of keywords to be searched
        :type keywords: List[str]
@@ -355,90 +382,89 @@ def search_keywords(self, keywords: List[str]):
             'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'}
        """
-
-        formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in keywords]
-        formated_keywords = '+'.join(formated_keywords)
+        formated_keywords = [
+            "label:" + requests.utils.quote(keyword) for keyword in keywords
+        ]
+        formated_keywords = "+".join(formated_keywords)
        url = _KEYWORDSEARCHBASE.format(formated_keywords)
        return self.__nav.search_authors(url)

-    def search_pubs_custom_url(self, url: str)->_SearchScholarIterator:
-        """Search by custom URL and return a generator of Publication objects
-        URL should be of the form '/scholar?q=...'
+    def search_pubs_custom_url(self, url: str) -> _SearchScholarIterator:
+        """Search by custom URL and return a generator of Publication objects.
+
+        The URL should be of the form '/scholar?q=...'.

        A typical use case is to generate the URL by first typing in search
        parameters in the Advanced Search dialog box and then use the URL here
        to programmatically fetch the results.

-        :param url: custom url to seach for the publication
+        :param url: custom url to search for the publication
        :type url: string
        """
        return self.__nav.search_publications(url)

-    def search_author_custom_url(self, url: str)->Author:
-        """Search by custom URL and return a generator of Author objects
-        URL should be of the form '/citation?q=...'
+    def search_author_custom_url(self, url: str) -> Author:
+        """Search by custom URL and return a generator of Author objects.
+
+        The URL should be of the form '/citation?q=...'.

        :param url: url for the custom author url
        :type url: string
        """
        return self.__nav.search_authors(url)

-    def get_related_articles(self, object: Publication)->_SearchScholarIterator:
-        """
-        Search google scholar for related articles to a specific publication.
+    def get_related_articles(self, object: Publication) -> _SearchScholarIterator:
+        """Search Google Scholar for related articles to a specific publication.

        :param object: Publication object used to get the related articles
        :type object: Publication
        """
-        if object['container_type'] != 'Publication':
+        if object["container_type"] != "Publication":
            self.logger.warning("Not a publication object")
            return

-        if object['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
-            if 'url_related_articles' not in object.keys():
+        if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
+            if "url_related_articles" not in object.keys():
                object = self.fill(object)
-            return self.__nav.search_publications(object['url_related_articles'])
-        elif object['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET:
-            return self.__nav.search_publications(object['url_related_articles'])
+            return self.__nav.search_publications(object["url_related_articles"])
+        elif object["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET:
+            return self.__nav.search_publications(object["url_related_articles"])

-    def pprint(self, object: Author or Publication)->None:
-        """Pretty print an Author or Publication container object
+    def pprint(self, object: Author or Publication) -> None:
+        """Pretty print an Author or Publication container object.
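+
+        A short sketch of typical use, mirroring the search examples above
+        (the author query is illustrative):
+
+        .. testcode::
+
+            author = next(scholarly.search_author('Marty Banks, Berkeley'))
+            scholarly.pprint(author)
+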
:param object: Publication or Author container object :type object: Author or Publication """ - if 'container_type' not in object: + if "container_type" not in object: self.logger.warning("Not a scholarly container object") return to_print = copy.deepcopy(object) - if to_print['container_type'] == 'Publication': - to_print['source'] = PublicationSource(to_print['source']).name - elif to_print['container_type'] == 'Author': + if to_print["container_type"] == "Publication": + to_print["source"] = PublicationSource(to_print["source"]).name + elif to_print["container_type"] == "Author": parser = AuthorParser(self.__nav) - to_print['source'] = AuthorSource(to_print['source']).name - if parser._sections == to_print['filled']: - to_print['filled'] = True + to_print["source"] = AuthorSource(to_print["source"]).name + if parser._sections == to_print["filled"]: + to_print["filled"] = True else: - to_print['filled'] = False - - if 'coauthors' in to_print: - for coauthor in to_print['coauthors']: - coauthor['filled'] = False - del coauthor['container_type'] - coauthor['source'] = AuthorSource(coauthor['source']).name - - if 'publications' in to_print: - for publication in to_print['publications']: - publication['source'] = PublicationSource(publication['source']).name - del publication['container_type'] - - del to_print['container_type'] + to_print["filled"] = False + + if "coauthors" in to_print: + for coauthor in to_print["coauthors"]: + coauthor["filled"] = False + del coauthor["container_type"] + coauthor["source"] = AuthorSource(coauthor["source"]).name + + if "publications" in to_print: + for publication in to_print["publications"]: + publication["source"] = PublicationSource( + publication["source"] + ).name + del publication["container_type"] + + del to_print["container_type"] print(pprint.pformat(to_print).encode("utf-8")) def search_org(self, name: str, fromauthor: bool = False) -> list: - """ - Search by organization name and return a list of possible disambiguations + """Search by organization name and return a list of possible disambiguations. :Example:: .. testcode:: @@ -453,13 +479,11 @@ def search_org(self, name: str, fromauthor: bool = False) -> list: 'id': '9670678584336165373'} ] """ - url = _AUTHSEARCH.format(requests.utils.quote(name)) return self.__nav.search_organization(url, fromauthor) def search_author_by_organization(self, organization_id: int): - """ - Search for authors in an organization and return a generator of Authors + """Search for authors in an organization and return a generator of Authors. ``organization_id`` can be found from the organization name using ``search_org``. Alternatively, they can be found in the ``Author`` object. @@ -474,15 +498,16 @@ def search_author_by_organization(self, organization_id: int): url = _ORGSEARCH.format(organization_id) return self.__nav.search_authors(url) - def download_mandates_csv(self, filename: str, overwrite: bool = False, - include_links: bool =True): - """ - Download the CSV file of the current mandates. - """ + def download_mandates_csv( + self, filename: str, overwrite: bool = False, include_links: bool = True + ): + """Download the CSV file of the current mandates.""" if (not overwrite) and os.path.exists(filename): - raise ValueError(f"{filename} already exists. Either provide a " - "different filename or allow overwriting by " - "setting overwrite=True") + raise ValueError( + f"{filename} already exists. 
Either provide a " + "different filename or allow overwriting by " + "setting overwrite=True" + ) text = self.__nav._get_page(_MANDATES_URL, premium=False) if include_links: soup = self.__nav._get_soup("/citations?hl=en&view_op=mandates_leaderboard") @@ -491,7 +516,7 @@ def download_mandates_csv(self, filename: str, overwrite: bool = False, cached = agency.find("span", class_="gs_a").a["href"] name = agency.a.text if name != "cached": - policy = agency.a['href'] + policy = agency.a["href"] else: name = agency.text[:-10] policy = "" @@ -501,71 +526,83 @@ def download_mandates_csv(self, filename: str, overwrite: bool = False, else: text = text.replace(f"{name},", f"{name},{policy},{cached},") try: - with open(filename, 'w') as f: + with open(filename, "w") as f: f.write(text) - except IOError: + except OSError: self.logger.error("Error writing mandates as %s", filename) finally: return text # TODO: Make it a public method in v1.6 - def _construct_url(self, baseurl: str, patents: bool = True, - citations: bool = True, year_low: int = None, - year_high: int = None, sort_by: str = "relevance", - include_last_year: str = "abstracts", - start_index: int = 0)-> str: + def _construct_url( + self, + baseurl: str, + patents: bool = True, + citations: bool = True, + year_low: int = None, + year_high: int = None, + sort_by: str = "relevance", + include_last_year: str = "abstracts", + start_index: int = 0, + ) -> str: """Construct URL from requested parameters.""" url = baseurl - yr_lo = '&as_ylo={0}'.format(year_low) if year_low is not None else '' - yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else '' - citations = '&as_vis={0}'.format(1 - int(citations)) - patents = '&as_sdt={0},33'.format(1 - int(patents)) - sortby = '' - start = '&start={0}'.format(start_index) if start_index > 0 else '' + yr_lo = f"&as_ylo={year_low}" if year_low is not None else "" + yr_hi = f"&as_yhi={year_high}" if year_high is not None else "" + citations = f"&as_vis={1 - int(citations)}" + patents = f"&as_sdt={1 - int(patents)},33" + sortby = "" + start = f"&start={start_index}" if start_index > 0 else "" if sort_by == "date": if include_last_year == "abstracts": - sortby = '&scisbd=1' + sortby = "&scisbd=1" elif include_last_year == "everything": - sortby = '&scisbd=2' + sortby = "&scisbd=2" else: - self.logger.debug("Invalid option for 'include_last_year', available options: 'everything', 'abstracts'") + self.logger.debug( + "Invalid option for 'include_last_year', available options: 'everything', 'abstracts'" + ) return elif sort_by != "relevance": - self.logger.debug("Invalid option for 'sort_by', available options: 'relevance', 'date'") + self.logger.debug( + "Invalid option for 'sort_by', available options: 'relevance', 'date'" + ) return # improve str below return url + yr_lo + yr_hi + citations + patents + sortby + start def get_journal_categories(self): - """ - Get a dict of journal categories and subcategories. 
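For reference, a hedged sketch of the query string `_construct_url` assembles, following the concatenation order in the return statement above (the search terms are illustrative):

```python
# Mirrors: url + yr_lo + yr_hi + citations + patents + sortby + start
base = "/scholar?hl=en&q=deep+learning"  # illustrative base URL
# year_low=2015, citations=True, patents=True, sort_by="date",
# include_last_year="abstracts", start_index=0
url = base + "&as_ylo=2015" + "" + "&as_vis=0" + "&as_sdt=0,33" + "&scisbd=1" + ""
print(url)  # /scholar?hl=en&q=deep+learning&as_ylo=2015&as_vis=0&as_sdt=0,33&scisbd=1
```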
- """ + """Get a dict of journal categories and subcategories.""" soup = self.__nav._get_soup("/citations?view_op=top_venues&hl=en&vq=en") categories = {} for category in soup.find_all("a", class_="gs_md_li"): - if not "vq=" in category['href']: + if not "vq=" in category["href"]: continue - vq = category['href'].split("&vq=")[1] + vq = category["href"].split("&vq=")[1] categories[category.text] = {} categories[category.text][None] = vq for category in categories: vq = categories[category][None] - if vq=="en": + if vq == "en": continue soup = self.__nav._get_soup(f"/citations?view_op=top_venues&hl=en&vq={vq}") for subcategory in soup.find_all("a", class_="gs_md_li"): - if not f"&vq={vq}_" in subcategory['href']: + if not f"&vq={vq}_" in subcategory["href"]: continue - categories[category][subcategory.text] = subcategory['href'].split("&vq=")[1] + categories[category][subcategory.text] = subcategory["href"].split( + "&vq=" + )[1] - #print(categories) + # print(categories) return categories - def get_journals(self, category='English', subcategory=None, include_comments: bool = False) -> Dict[int, Journal]: + def get_journals( + self, category="English", subcategory=None, include_comments: bool = False + ) -> Dict[int, Journal]: try: cat = self.journal_categories[category] try: @@ -578,46 +615,63 @@ def get_journals(self, category='English', subcategory=None, include_comments: b h5indices = soup.find_all("a", class_="gs_ibl gsc_mp_anchor") h5medians = soup.find_all("span", class_="gs_ibl") - - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() result = {} - for rank, name, h5index, h5median in zip(ranks, names, h5indices, h5medians): - url_citations = h5index['href'] + for rank, name, h5index, h5median in zip( + ranks, names, h5indices, h5medians + ): + url_citations = h5index["href"] comment = "" if include_comments: soup = self.__nav._get_soup(url_citations) try: - for cmt in soup.find_all('ul', class_='gsc_mlhd_list')[1].find_all('li'): - comment += cmt.text+"; " + for cmt in soup.find_all("ul", class_="gsc_mlhd_list")[ + 1 + ].find_all("li"): + comment += cmt.text + "; " except IndexError: pass - result[int(rank.text[:-1])] = Journal(name=name.text, - h5_index=int(h5index.text), - h5_median=int(h5median.text), - url_citations=url_citations, - comment=comment - ) - #print(result) + result[int(rank.text[:-1])] = Journal( + name=name.text, + h5_index=int(h5index.text), + h5_median=int(h5median.text), + url_citations=url_citations, + comment=comment, + ) + # print(result) return result except KeyError: - raise ValueError("Invalid subcategory: %s for %s. Choose one from %s" % (subcategory, category, cat.keys())) + raise ValueError( + "Invalid subcategory: %s for %s. Choose one from %s" + % (subcategory, category, cat.keys()) + ) except KeyError: - raise ValueError("Invalid category: %s. Choose one from %s", category, self.journal_categories.keys()) - - def save_journals_csv(self, filename, category="English", subcategory=None, include_comments=False): - """ - Save a list of journals to a file in CSV format. - """ + raise ValueError( + "Invalid category: %s. 
Choose one from %s", + category, + self.journal_categories.keys(), + ) + + def save_journals_csv( + self, filename, category="English", subcategory=None, include_comments=False + ): + """Save a list of journals to a file in CSV format.""" journals = self.get_journals(category, subcategory, include_comments) try: - with open(filename, 'w') as f: + with open(filename, "w") as f: csv_writer = csv.writer(f) - header = ['Publication', 'h5-index', 'h5-median'] + ['Comment']*include_comments + header = ["Publication", "h5-index", "h5-median"] + [ + "Comment" + ] * include_comments csv_writer.writerow(header) for rank, journal in journals.items(): - row = [journal['name'], journal['h5_index'], journal['h5_median']] + [journal.get('comment', '')]*include_comments + row = [ + journal["name"], + journal["h5_index"], + journal["h5_median"], + ] + [journal.get("comment", "")] * include_comments csv_writer.writerow(row) - except IOError: + except OSError: self.logger.error("Error writing journals as %s", filename) finally: return journals diff --git a/scholarly/author_parser.py b/scholarly/author_parser.py index 5e8791af..4385509c 100644 --- a/scholarly/author_parser.py +++ b/scholarly/author_parser.py @@ -1,193 +1,211 @@ -from .publication_parser import PublicationParser -import re -from .data_types import Author, AuthorSource, PublicationSource, PublicAccess import codecs +import re + +from .data_types import Author, AuthorSource, PublicAccess, PublicationSource +from .publication_parser import PublicationParser -_CITATIONAUTHRE = r'user=([\w-]*)' -_HOST = 'https://scholar.google.com{0}' +_CITATIONAUTHRE = r"user=([\w-]*)" +_HOST = "https://scholar.google.com{0}" _PAGESIZE = 100 -_EMAILAUTHORRE = r'Verified email at ' -_CITATIONAUTH = '/citations?hl=en&user={0}' -_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}' +_EMAILAUTHORRE = r"Verified email at " +_CITATIONAUTH = "/citations?hl=en&user={0}" +_COAUTH = "/citations?view_op=list_colleagues&hl=en&user={0}" _MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}" class AuthorParser: - """Returns an object for a single author""" + """Returns an object for a single author.""" def __init__(self, nav): self.nav = nav - self._sections = ['basics', - 'indices', - 'counts', - 'coauthors', - 'publications', - 'public_access'] - - def get_author(self, __data)->Author: - """ Fills the information for an author container - """ - author: Author = {'container_type': 'Author'} - author['filled'] = [] + self._sections = [ + "basics", + "indices", + "counts", + "coauthors", + "publications", + "public_access", + ] + + def get_author(self, __data) -> Author: + """Fills the information for an author container.""" + author: Author = {"container_type": "Author"} + author["filled"] = [] if isinstance(__data, str): - author['scholar_id'] = __data - author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE + author["scholar_id"] = __data + author["source"] = AuthorSource.AUTHOR_PROFILE_PAGE else: - author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS - author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0] + author["source"] = AuthorSource.SEARCH_AUTHOR_SNIPPETS + author["scholar_id"] = re.findall(_CITATIONAUTHRE, __data("a")[0]["href"])[ + 0 + ] - pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id']) - author['url_picture'] = _HOST.format(pic) + pic = "/citations?view_op=medium_photo&user={}".format(author["scholar_id"]) + author["url_picture"] = _HOST.format(pic) - name_class = 
self._find_tag_class_name(__data, 'h3', 'name') - author['name'] = __data.find('h3', class_=name_class).text + name_class = self._find_tag_class_name(__data, "h3", "name") + author["name"] = __data.find("h3", class_=name_class).text - aff_class = self._find_tag_class_name(__data, 'div', 'aff') - affiliation = __data.find('div', class_=aff_class) + aff_class = self._find_tag_class_name(__data, "div", "aff") + affiliation = __data.find("div", class_=aff_class) if affiliation: - author['affiliation'] = affiliation.text + author["affiliation"] = affiliation.text - email_class = self._find_tag_class_name(__data, 'div', 'eml') - email = __data.find('div', class_=email_class) + email_class = self._find_tag_class_name(__data, "div", "eml") + email = __data.find("div", class_=email_class) if email: - author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text) + author["email_domain"] = re.sub(_EMAILAUTHORRE, r"@", email.text) - int_class = self._find_tag_class_name(__data, 'a', 'one_int') + int_class = self._find_tag_class_name(__data, "a", "one_int") if int_class: - interests = __data.find_all('a', class_=int_class) - author['interests'] = [i.text.strip() for i in interests] + interests = __data.find_all("a", class_=int_class) + author["interests"] = [i.text.strip() for i in interests] else: - author['interests'] = [] + author["interests"] = [] - citedby_class = self._find_tag_class_name(__data, 'div', 'cby') - citedby = __data.find('div', class_=citedby_class) - if citedby and citedby.text != '': - author['citedby'] = int(citedby.text[9:]) + citedby_class = self._find_tag_class_name(__data, "div", "cby") + citedby = __data.find("div", class_=citedby_class) + if citedby and citedby.text != "": + author["citedby"] = int(citedby.text[9:]) return author - def _find_tag_class_name(self, __data, tag, text): elements = __data.find_all(tag) for element in elements: - if 'class' in element.attrs and text in element.attrs['class'][0]: - return element.attrs['class'][0] + if "class" in element.attrs and text in element.attrs["class"][0]: + return element.attrs["class"][0] def _fill_basics(self, soup, author): - author['name'] = soup.find('div', id='gsc_prf_in').text - if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: - res = soup.find('img', id='gsc_prf_pup-img') + author["name"] = soup.find("div", id="gsc_prf_in").text + if author["source"] == AuthorSource.AUTHOR_PROFILE_PAGE: + res = soup.find("img", id="gsc_prf_pup-img") if res is not None: - if "avatar_scholar" not in res['src']: - author['url_picture'] = res['src'] - elif author['source'] == AuthorSource.CO_AUTHORS_LIST: - picture = soup.find('img', id="gsc_prf_pup-img").get('src') + if "avatar_scholar" not in res["src"]: + author["url_picture"] = res["src"] + elif author["source"] == AuthorSource.CO_AUTHORS_LIST: + picture = soup.find("img", id="gsc_prf_pup-img").get("src") if "avatar_scholar" in picture: picture = _HOST.format(picture) - author['url_picture'] = picture + author["url_picture"] = picture - affiliation = soup.find('div', class_='gsc_prf_il') - author['affiliation'] = affiliation.text - affiliation_link = affiliation.find('a') + affiliation = soup.find("div", class_="gsc_prf_il") + author["affiliation"] = affiliation.text + affiliation_link = affiliation.find("a") if affiliation_link: - author['organization'] = int(affiliation_link.get('href').split("org=")[-1]) - author['interests'] = [i.text.strip() for i in - soup.find_all('a', class_='gsc_prf_inta')] - email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il") - if 
author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: + author["organization"] = int(affiliation_link.get("href").split("org=")[-1]) + author["interests"] = [ + i.text.strip() for i in soup.find_all("a", class_="gsc_prf_inta") + ] + email = soup.find("div", id="gsc_prf_ivh", class_="gsc_prf_il") + if author["source"] == AuthorSource.AUTHOR_PROFILE_PAGE: if email.text != "No verified email": - author['email_domain'] = '@'+email.text.split(" ")[3] - homepage = email.find('a', class_="gsc_prf_ila") + author["email_domain"] = "@" + email.text.split(" ")[3] + homepage = email.find("a", class_="gsc_prf_ila") if homepage: - author['homepage'] = homepage.get('href') + author["homepage"] = homepage.get("href") - index = soup.find_all('td', class_='gsc_rsb_std') + index = soup.find_all("td", class_="gsc_rsb_std") if index: - author['citedby'] = int(index[0].text) + author["citedby"] = int(index[0].text) def _fill_indices(self, soup, author): - index = soup.find_all('td', class_='gsc_rsb_std') + index = soup.find_all("td", class_="gsc_rsb_std") if index: - author['citedby'] = int(index[0].text) - author['citedby5y'] = int(index[1].text) - author['hindex'] = int(index[2].text) - author['hindex5y'] = int(index[3].text) - author['i10index'] = int(index[4].text) - author['i10index5y'] = int(index[5].text) + author["citedby"] = int(index[0].text) + author["citedby5y"] = int(index[1].text) + author["hindex"] = int(index[2].text) + author["hindex5y"] = int(index[3].text) + author["i10index"] = int(index[4].text) + author["i10index5y"] = int(index[5].text) else: - author['hindex'] = 0 - author['hindex5y'] = 0 - author['i10index'] = 0 - author['i10index5y'] = 0 + author["hindex"] = 0 + author["hindex5y"] = 0 + author["i10index"] = 0 + author["i10index5y"] = 0 def _fill_counts(self, soup, author): - years = [int(y.text) - for y in soup.find_all('span', class_='gsc_g_t')] - cites = [int(c.text) - for c in soup.find_all('span', class_='gsc_g_al')] - author['cites_per_year'] = dict(zip(years, cites)) + years = [int(y.text) for y in soup.find_all("span", class_="gsc_g_t")] + cites = [int(c.text) for c in soup.find_all("span", class_="gsc_g_al")] + author["cites_per_year"] = dict(zip(years, cites)) def _fill_public_access(self, soup, author): - available = soup.find('div', class_='gsc_rsb_m_a') - not_available = soup.find('div', class_='gsc_rsb_m_na') + available = soup.find("div", class_="gsc_rsb_m_a") + not_available = soup.find("div", class_="gsc_rsb_m_na") n_available, n_not_available = 0, 0 if available: n_available = int(re.sub("[.,]", "", available.text.split(" ")[0])) if not_available: n_not_available = int(re.sub("[.,]", "", not_available.text.split(" ")[0])) - author["public_access"] = PublicAccess(available=n_available, - not_available=n_not_available) + author["public_access"] = PublicAccess( + available=n_available, not_available=n_not_available + ) - if 'publications' not in author['filled']: + if "publications" not in author["filled"]: return # Make a dictionary mapping to the publications - publications = {pub['author_pub_id']:pub for pub in author['publications']} - soup = self.nav._get_soup(_MANDATES.format(author['scholar_id'], _PAGESIZE)) + publications = {pub["author_pub_id"]: pub for pub in author["publications"]} + soup = self.nav._get_soup(_MANDATES.format(author["scholar_id"], _PAGESIZE)) while True: - rows = soup.find_all('div', 'gsc_mnd_sec_na') + rows = soup.find_all("div", "gsc_mnd_sec_na") if rows: - for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'): - 
author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", - row['data-href'])[0] + for row in rows[0].find_all( + "a", "gsc_mnd_art_rvw gs_nph gsc_mnd_link_font" + ): + author_pub_id = re.findall( + r"citation_for_view=([\w:-]*)", row["data-href"] + )[0] publications[author_pub_id]["public_access"] = False - rows = soup.find_all('div', 'gsc_mnd_sec_avl') + rows = soup.find_all("div", "gsc_mnd_sec_avl") if rows: - for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'): - author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", - row['data-href'])[0] + for row in rows[0].find_all( + "a", "gsc_mnd_art_rvw gs_nph gsc_mnd_link_font" + ): + author_pub_id = re.findall( + r"citation_for_view=([\w:-]*)", row["data-href"] + )[0] publications[author_pub_id]["public_access"] = True next_button = soup.find(class_="gs_btnPR") if next_button and "disabled" not in next_button.attrs: - url = next_button['onclick'][17:-1] + url = next_button["onclick"][17:-1] url = codecs.getdecoder("unicode_escape")(url)[0] soup = self.nav._get_soup(url) else: break - - def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''): - author['publications'] = list() + def _fill_publications( + self, soup, author, publication_limit: int = 0, sortby_str: str = "" + ): + author["publications"] = list() pubstart = 0 - url_citations = _CITATIONAUTH.format(author['scholar_id']) + url_citations = _CITATIONAUTH.format(author["scholar_id"]) url_citations += sortby_str pub_parser = PublicationParser(self.nav) flag = False while True: - for row in soup.find_all('tr', class_='gsc_a_tr'): - new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY) - author['publications'].append(new_pub) - if (publication_limit) and (len(author['publications']) >= publication_limit): + for row in soup.find_all("tr", class_="gsc_a_tr"): + new_pub = pub_parser.get_publication( + row, PublicationSource.AUTHOR_PUBLICATION_ENTRY + ) + author["publications"].append(new_pub) + if (publication_limit) and ( + len(author["publications"]) >= publication_limit + ): flag = True break - if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs and not flag: + if ( + "disabled" not in soup.find("button", id="gsc_bpf_more").attrs + and not flag + ): pubstart += _PAGESIZE - url = '{0}&cstart={1}&pagesize={2}'.format( - url_citations, pubstart, _PAGESIZE) + url = "{}&cstart={}&pagesize={}".format( + url_citations, pubstart, _PAGESIZE + ) soup = self.nav._get_soup(url) else: break @@ -203,47 +221,48 @@ def _get_coauthors_short(self, soup): ----- This method is to be called by _fill_coauthors method. """ - coauthors = soup.find_all('span', class_='gsc_rsb_a_desc') - coauthor_ids = [re.findall(_CITATIONAUTHRE, - coauth('a')[0].get('href'))[0] - for coauth in coauthors] + coauthors = soup.find_all("span", class_="gsc_rsb_a_desc") + coauthor_ids = [ + re.findall(_CITATIONAUTHRE, coauth("a")[0].get("href"))[0] + for coauth in coauthors + ] - coauthor_names = [coauth.find(tabindex="-1").text - for coauth in coauthors] - coauthor_affils = [coauth.find(class_="gsc_rsb_a_ext").text - for coauth in coauthors] + coauthor_names = [coauth.find(tabindex="-1").text for coauth in coauthors] + coauthor_affils = [ + coauth.find(class_="gsc_rsb_a_ext").text for coauth in coauthors + ] return coauthor_ids, coauthor_names, coauthor_affils def _get_coauthors_long(self, author): """Get the long (>20) list of coauthors. 
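The `while True` loop above pages through the publication list in steps of `_PAGESIZE`; a small sketch of the URL sequence it fetches (the scholar id is a placeholder):

```python
_PAGESIZE = 100
url_citations = "/citations?hl=en&user=PLACEHOLDER_ID"  # placeholder id
pages = [
    f"{url_citations}&cstart={start}&pagesize={_PAGESIZE}"
    for start in (0, 100, 200)
]
print(pages[1])  # /citations?hl=en&user=PLACEHOLDER_ID&cstart=100&pagesize=100
```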
-        This method fetches the complete list of coauthors bu opening a new
+        This method fetches the complete list of coauthors by opening a new
        page filled with the complete coauthor list.

        Note:
        -----
        This method is to be called by _fill_coauthors method.
        """
-        soup = self.nav._get_soup(_COAUTH.format(author['scholar_id']))
-        coauthors = soup.find_all('div', 'gs_ai gs_scl')
-        coauthor_ids = [re.findall(_CITATIONAUTHRE,
-                                   coauth('a')[0].get('href'))[0]
-                        for coauth in coauthors]
+        soup = self.nav._get_soup(_COAUTH.format(author["scholar_id"]))
+        coauthors = soup.find_all("div", "gs_ai gs_scl")
+        coauthor_ids = [
+            re.findall(_CITATIONAUTHRE, coauth("a")[0].get("href"))[0]
+            for coauth in coauthors
+        ]

        coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors]
-        coauthor_affils = [coauth.find(class_="gs_ai_aff").text
-                           for coauth in coauthors]
+        coauthor_affils = [coauth.find(class_="gs_ai_aff").text for coauth in coauthors]

        return coauthor_ids, coauthor_names, coauthor_affils

    def _fill_coauthors(self, soup, author):
        # If "View All" is not found, scrape the page for coauthors
-        if not soup.find_all('button', id='gsc_coauth_opn'):
+        if not soup.find_all("button", id="gsc_coauth_opn"):
            coauthor_info = self._get_coauthors_short(soup)
        else:
-            # If "View All" is found, try opening the dialog box.
-            # If geckodriver is not installed, resort to a short list and warn.
+            # If "View All" is found, try opening the dialog box.
+            # If geckodriver is not installed, resort to a short list and warn.
            try:
                coauthor_info = self._get_coauthors_long(author)
            except Exception as err:
@@ -251,16 +270,18 @@ def _fill_coauthors(self, soup, author):
                self.nav.logger.warning(err)
                self.nav.logger.warning("Fetching only the top 20 coauthors")

-        author['coauthors'] = []
+        author["coauthors"] = []
        for coauth_id, coauth_name, coauth_affil in zip(*coauthor_info):
            new_coauthor = self.get_author(coauth_id)
-            new_coauthor['name'] = coauth_name
-            new_coauthor['affiliation'] = coauth_affil
-            new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST
-            author['coauthors'].append(new_coauthor)
+            new_coauthor["name"] = coauth_name
+            new_coauthor["affiliation"] = coauth_affil
+            new_coauthor["source"] = AuthorSource.CO_AUTHORS_LIST
+            author["coauthors"].append(new_coauthor)

-    def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
-        """Populate the Author with information from their profile
+    def fill(
+        self, author, sections: list = [], sortby="citedby", publication_limit: int = 0
+    ):
+        """Populate the Author with information from their profile.

        The `sections` argument allows for finer granularity of the profile
        information to be pulled.
@@ -279,7 +300,7 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit:
        :type sortby: string
        :param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
        :type publication_limit: int
-        :returns: The filled object if fill was successfull, False otherwise.
+        :returns: The filled object if fill was successful, False otherwise.
:rtype: Author or bool :Example:: @@ -425,32 +446,48 @@ def fill(self, author, sections: list = [], sortby="citedby", publication_limit: """ try: sections = [section.lower() for section in sections] - sections.sort(reverse=True) # Ensure 'publications' comes before 'public_access' - sortby_str = '' + sections.sort( + reverse=True + ) # Ensure 'publications' comes before 'public_access' + sortby_str = "" if sortby == "year": - sortby_str = '&view_op=list_works&sortby=pubdate' + sortby_str = "&view_op=list_works&sortby=pubdate" elif sortby != "citedby": - raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'") - url_citations = _CITATIONAUTH.format(author['scholar_id']) + raise Exception( + "Please enter a valid sortby parameter. Options: 'year', 'citedby'" + ) + url_citations = _CITATIONAUTH.format(author["scholar_id"]) url_citations += sortby_str - url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE) + url = f"{url_citations}&pagesize={_PAGESIZE}" soup = self.nav._get_soup(url) if sections == []: for i in self._sections: - if i not in author['filled']: - (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) - author['filled'].append(i) + if i not in author["filled"]: + ( + getattr(self, f"_fill_{i}")(soup, author) + if i != "publications" + else getattr(self, f"_fill_{i}")( + soup, author, publication_limit, sortby_str + ) + ) + author["filled"].append(i) else: for i in sections: - if i in self._sections and i not in author['filled']: - (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) - author['filled'].append(i) + if i in self._sections and i not in author["filled"]: + ( + getattr(self, f"_fill_{i}")(soup, author) + if i != "publications" + else getattr(self, f"_fill_{i}")( + soup, author, publication_limit, sortby_str + ) + ) + author["filled"].append(i) except Exception as e: - raise(e) + raise (e) return author - def __repr__(self): + """Printout.""" return self.__str__() diff --git a/scholarly/data_types.py b/scholarly/data_types.py index dade6bae..ceadc45a 100644 --- a/scholarly/data_types.py +++ b/scholarly/data_types.py @@ -1,7 +1,6 @@ import sys - from enum import Enum -from typing import List, Dict +from typing import Dict, List if sys.version_info >= (3, 8): from typing import TypedDict @@ -10,9 +9,9 @@ class PublicationSource(str, Enum): - ''' - Defines the source of the publication. In general, a publication - on Google Scholar has two forms: + """Defines the source of the publication. + + In general, a publication on Google Scholar has two forms: * Appearing as a PUBLICATION SNIPPET and * Appearing as a paper in an AUTHOR PAGE @@ -49,7 +48,7 @@ class PublicationSource(str, Enum): We also have publications that appear in the "author pages" of Google Scholar. These publications are often a set of publications "merged" together. - The snippet version of these publications conains the title of the publication, + The snippet version of these publications contains the title of the publication, a subset of the authors, the (sometimes truncated) venue, and the year of the publication and the number of papers that cite the publication. 
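The `getattr(self, f"_fill_{i}")` calls above dispatch each requested section name to the matching `_fill_<section>` method; a stripped-down, self-contained sketch of that pattern (class name and data are illustrative):

```python
class SectionFiller:
    # Illustrative only: mirrors AuthorParser's name-based dispatch.
    def _fill_basics(self, soup, author):
        author.setdefault("filled", []).append("basics")

    def fill(self, soup, author, sections):
        for section in sections:
            getattr(self, f"_fill_{section}")(soup, author)
        return author


print(SectionFiller().fill(None, {}, ["basics"]))  # {'filled': ['basics']}
```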
@@ -58,31 +57,31 @@ class PublicationSource(str, Enum): To fill in the publication, we open the "detailed view" of the paper Detailed view page: https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-Km63D4AAAAJ:d1gkVwhDpl0C - ''' + """ + PUBLICATION_SEARCH_SNIPPET = "PUBLICATION_SEARCH_SNIPPET" AUTHOR_PUBLICATION_ENTRY = "AUTHOR_PUBLICATION_ENTRY" JOURNAL_CITATION_LIST = "JOURNAL_CITATION_LIST" class AuthorSource(str, Enum): - ''' - Defines the source of the HTML that will be parsed. + """Defines the source of the HTML that will be parsed. Author page: https://scholar.google.com/citations?hl=en&user=yxUduqMAAAAJ Search authors: https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=jordan&btnG= Coauthors: From the list of co-authors from an Author page - ''' + """ + AUTHOR_PROFILE_PAGE = "AUTHOR_PROFILE_PAGE" SEARCH_AUTHOR_SNIPPETS = "SEARCH_AUTHOR_SNIPPETS" CO_AUTHORS_LIST = "CO_AUTHORS_LIST" class ProxyMode(str, Enum): - """ - Defines the different types supported. - """ + """Defines the different types supported.""" + FREE_PROXIES = "FREE_PROXIES" SCRAPERAPI = "SCRAPERAPI" LUMINATI = "LUMINATI" @@ -92,21 +91,21 @@ class ProxyMode(str, Enum): TOR_INTERNAL = "TOR_INTERNAL" -''' Lightweight Data Structure to keep distribution of citations of the years ''' +""" Lightweight Data Structure to keep distribution of citations of the years. """ CitesPerYear = Dict[int, int] -''' Lightweight Data Structure to hold the numbers articles available or - not available publicly according to funding mandates -''' -PublicAccess = TypedDict('PublicAccess', {"available": int, "not_available": int}) +class PublicAccess(TypedDict): + """Lightweight Data Structure to hold the numbers articles available or not available publicly according to funding mandates.""" + + available: int + not_available: int class BibEntry(TypedDict, total=False): - """ - :class:`BibEntry ` The bibliographic entry for a publication - (When source is not specified, the field is present in all sources) + """:class:`BibEntry ` The bibliographic entry for a publication. + (When source is not specified, the field is present in all sources) :param pub_type: the type of entry for this bib (for example 'article') (source: PUBLICATION_SEARCH_SNIPPET) :param bib_id: bib entry id (source: PUBLICATION_SEARCH_SNIPPET) :param abstract: description of the publication @@ -122,6 +121,7 @@ class BibEntry(TypedDict, total=False): :param citation: Formatted citation string, usually containing journal name, volume and page numbers (source: AUTHOR_PUBLICATION_ENTRY) :param pub_url: url of the website providing the publication """ + pub_type: str bib_id: str abstract: str @@ -138,8 +138,7 @@ class BibEntry(TypedDict, total=False): class Mandate(TypedDict, total=False): - """ - :class:`Mandate ` A funding mandate for a given year + """:class:`Mandate ` A funding mandate for a given year. :param agency: name of the funding agency :param url_policy: url of the policy for this mandate @@ -149,6 +148,7 @@ class Mandate(TypedDict, total=False): :param acknowledgement: text in the paper acknowledging the funding :param grant: grant ID that supported this work """ + agency: str url_policy: str url_policy_cached: str @@ -159,10 +159,9 @@ class Mandate(TypedDict, total=False): class Publication(TypedDict, total=False): - """ - :class:`Publication ` object used to represent a publication entry on Google Scholar. 
- (When source is not specified, the field is present in all sources) + """:class:`Publication ` object used to represent a publication entry on Google Scholar. + (When source is not specified, the field is present in all sources) :param BibEntryCitation: contains additional information about the publication :param gsrank: position of the publication in the query (source: PUBLICATION_SEARCH_SNIPPET) :param author_id: list of the corresponding author ids of the authors that contributed to the Publication (source: PUBLICATION_SEARCH_SNIPPET) @@ -183,7 +182,7 @@ class Publication(TypedDict, total=False): the "citedby_url" will be a comma-separated list of values. It is also used to return the "cluster" of all the different versions of the paper. https://scholar.google.com/scholar?cluster=16766804411681372720&hl=en - :param cites_per_year: a dictionay containing the number of citations per year for this Publication + :param cites_per_year: a dictionary containing the number of citations per year for this Publication (source: AUTHOR_PUBLICATION_ENTRY) :param eprint_url: digital version of the Publication. Usually it is a pdf. :param pub_url: url of the website providing the publication @@ -224,11 +223,11 @@ class Publication(TypedDict, total=False): source: PublicationSource container_type: str + class Author(TypedDict, total=False): - """ - :class:`Author ` object used to represent an author entry on Google Scholar. - (When source is not specified, the field is present in all sources) + """:class:`Author ` object used to represent an author entry on Google Scholar. + (When source is not specified, the field is present in all sources) :param scholar_id: The id of the author on Google Scholar :param name: The name of the author :param affiliation: The affiliation of the author @@ -271,16 +270,15 @@ class Author(TypedDict, total=False): cites_per_year: CitesPerYear public_access: PublicAccess publications: List[Publication] - coauthors: List # List of authors. No self dict functionality available + coauthors: List # List of authors. No self dict functionality available container_type: str source: AuthorSource -class Journal(TypedDict, total=False): - """ - :class:`Journal ` object used to represent a journal entry on Google Scholar. - (When source is not specified, the field is present in all sources) +class Journal(TypedDict, total=False): + """:class:`Journal ` object used to represent a journal entry on Google Scholar. + (When source is not specified, the field is present in all sources) :param name: The name of the journal :param h5-index: h5-index is the h-index for articles published in the journal during the last 5 complete years. :param h5-median: h5-median for a publication is the median number of citations for the articles that make up its h5-index. 
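Because these containers are TypedDicts, they behave as plain dicts at runtime; a hedged sketch of building and reading a `Journal` entry (field names as used by `get_journals` above, values invented):

```python
from scholarly.data_types import Journal

journal: Journal = {
    "name": "Example Journal",  # invented
    "h5_index": 42,  # invented
    "h5_median": 60,  # invented
    "url_citations": "/citations?hl=en&vq=en",
    "comment": "",
}
print(journal["name"], journal["h5_index"])
```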
diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index ec8132e8..7eab9267 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -1,54 +1,63 @@ import re -import bibtexparser + import arrow +import bibtexparser from bibtexparser.bibdatabase import BibDatabase -from .data_types import BibEntry, Mandate, Publication, PublicationSource +from .data_types import BibEntry, Mandate, Publication, PublicationSource -_SCHOLARPUBRE = r'cites=([\d,]*)' -_CITATIONPUB = '/citations?hl=en&view_op=view_citation&citation_for_view={0}' -_SCHOLARPUB = '/scholar?hl=en&oi=bibs&cites={0}' -_CITATIONPUBRE = r'citation_for_view=([\w-]*:[\w-]*)' -_BIBCITE = '/scholar?hl=en&q=info:{0}:scholar.google.com/\ -&output=cite&scirp={1}&hl=en' -_CITEDBYLINK = '/scholar?hl=en&cites={0}' -_MANDATES_URL = '/citations?view_op=view_mandate&hl=en&citation_for_view={0}' +_SCHOLARPUBRE = r"cites=([\d,]*)" +_CITATIONPUB = "/citations?hl=en&view_op=view_citation&citation_for_view={0}" +_SCHOLARPUB = "/scholar?hl=en&oi=bibs&cites={0}" +_CITATIONPUBRE = r"citation_for_view=([\w-]*:[\w-]*)" +_BIBCITE = "/scholar?hl=en&q=info:{0}:scholar.google.com/\ +&output=cite&scirp={1}&hl=en" +_CITEDBYLINK = "/scholar?hl=en&cites={0}" +_MANDATES_URL = "/citations?view_op=view_mandate&hl=en&citation_for_view={0}" _BIB_MAPPING = { - 'ENTRYTYPE': 'pub_type', - 'ID': 'bib_id', - 'year': 'pub_year', + "ENTRYTYPE": "pub_type", + "ID": "bib_id", + "year": "pub_year", } _BIB_DATATYPES = { - 'number': 'str', - 'volume': 'str', + "number": "str", + "volume": "str", } _BIB_REVERSE_MAPPING = { - 'pub_type': 'ENTRYTYPE', - 'bib_id': 'ID', + "pub_type": "ENTRYTYPE", + "bib_id": "ID", } -def remap_bib(parsed_bib: dict, mapping: dict, data_types:dict ={}) -> BibEntry: + +def remap_bib(parsed_bib: dict, mapping: dict, data_types: dict = {}) -> BibEntry: + """Remaps a bib entry.""" for key, value in mapping.items(): if key in parsed_bib: parsed_bib[value] = parsed_bib.pop(key) for key, value in data_types.items(): if key in parsed_bib: - if value == 'int': + if value == "int": parsed_bib[key] = int(parsed_bib[key]) return parsed_bib -class _SearchScholarIterator(object): - """Iterator that returns Publication objects from the search page + +class _SearchScholarIterator: + """Iterator that returns Publication objects from the search page. + I have removed all logging from here for simplicity. -V """ def __init__(self, nav, url: str): self._url = url - self._pubtype = PublicationSource.PUBLICATION_SEARCH_SNIPPET if "/scholar?" in url else PublicationSource.JOURNAL_CITATION_LIST + self._pubtype = ( + PublicationSource.PUBLICATION_SEARCH_SNIPPET + if "/scholar?" 
in url + else PublicationSource.JOURNAL_CITATION_LIST + ) self._nav = nav self._load_url(url) self.total_results = self._get_total_results() @@ -58,18 +67,20 @@ def _load_url(self, url: str): # this is temporary until setup json file self._soup = self._nav._get_soup(url) self._pos = 0 - self._rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl') + self._soup.find_all('div', class_='gsc_mpat_ttl') + self._rows = self._soup.find_all( + "div", class_="gs_r gs_or gs_scl" + ) + self._soup.find_all("div", class_="gsc_mpat_ttl") def _get_total_results(self): if self._soup.find("div", class_="gs_pda"): return None - for x in self._soup.find_all('div', class_='gs_ab_mdw'): + for x in self._soup.find_all("div", class_="gs_ab_mdw"): # Accounting for different thousands separators: # comma, dot, space, apostrophe - match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.\s’]+)', string=x.text) + match = re.match(pattern=r"(^|\s*About)\s*([0-9,\.\s’]+)", string=x.text) if match: - return int(re.sub(pattern=r'[,\.\s’]',repl='', string=match.group(2))) + return int(re.sub(pattern=r"[,\.\s’]", repl="", string=match.group(2))) return 0 # Iterator protocol @@ -83,9 +94,8 @@ def __next__(self): self._pos += 1 res = self.pub_parser.get_publication(row, self._pubtype) return res - elif self._soup.find(class_='gs_ico gs_ico_nav_next'): - url = self._soup.find( - class_='gs_ico gs_ico_nav_next').parent['href'] + elif self._soup.find(class_="gs_ico gs_ico_nav_next"): + url = self._soup.find(class_="gs_ico gs_ico_nav_next").parent["href"] self._url = url self._load_url(url) return self.__next__() @@ -94,62 +104,61 @@ def __next__(self): # Pickle protocol def __getstate__(self): - return {'url': self._url, 'pos': self._pos} + return {"url": self._url, "pos": self._pos} def __setstate__(self, state): # this needs validation -V - self._load_url(state['url']) - self._pos = state['pos'] + self._load_url(state["url"]) + self._pos = state["pos"] -class PublicationParser(object): - """Returns an object for a single publication""" +class PublicationParser: + """Returns an object for a single publication.""" def __init__(self, nav): self.nav = nav def _citation_pub(self, __data, publication: Publication): # create the bib entry in the dictionary - publication['bib']['title'] = __data.find('a', class_='gsc_a_at').text - publication['author_pub_id'] = re.findall(_CITATIONPUBRE, __data.find( - 'a', class_='gsc_a_at')['href'])[0] - citedby = __data.find(class_='gsc_a_ac') + publication["bib"]["title"] = __data.find("a", class_="gsc_a_at").text + publication["author_pub_id"] = re.findall( + _CITATIONPUBRE, __data.find("a", class_="gsc_a_at")["href"] + )[0] + citedby = __data.find(class_="gsc_a_ac") publication["num_citations"] = 0 - if citedby and not (citedby.text.isspace() or citedby.text == ''): + if citedby and not (citedby.text.isspace() or citedby.text == ""): publication["num_citations"] = int(citedby.text.strip()) publication["citedby_url"] = citedby["href"] - publication["cites_id"] = re.findall(_SCHOLARPUBRE, citedby["href"])[0].split(',') + publication["cites_id"] = re.findall(_SCHOLARPUBRE, citedby["href"])[ + 0 + ].split(",") - year = __data.find(class_='gsc_a_h') - if (year and year.text - and not year.text.isspace() - and len(year.text) > 0): - publication['bib']['pub_year'] = year.text.strip() + year = __data.find(class_="gsc_a_h") + if year and year.text and not year.text.isspace() and len(year.text) > 0: + publication["bib"]["pub_year"] = year.text.strip() - author_citation = __data.find_all('div', 
class_='gs_gray') + author_citation = __data.find_all("div", class_="gs_gray") try: citation = author_citation[1].text except IndexError: citation = "" - publication['bib']['citation'] = citation + publication["bib"]["citation"] = citation return publication - def get_publication(self, __data, pubtype: PublicationSource)->Publication: - """Returns a publication that has either 'citation' or 'scholar' source - """ + def get_publication(self, __data, pubtype: PublicationSource) -> Publication: + """Returns a publication that has either 'citation' or 'scholar' source.""" + publication: Publication = {"container_type": "Publication"} + publication["source"] = pubtype + publication["bib"] = {} + publication["filled"] = False - publication: Publication = {'container_type': 'Publication'} - publication['source'] = pubtype - publication['bib'] = {} - publication['filled'] = False - - if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: + if publication["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: return self._citation_pub(__data, publication) - elif publication['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: + elif publication["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: return self._scholar_pub(__data, publication) - elif publication['source'] == PublicationSource.JOURNAL_CITATION_LIST: + elif publication["source"] == PublicationSource.JOURNAL_CITATION_LIST: return publication # TODO: self._journal_pub(__data, publication) else: @@ -157,26 +166,32 @@ def get_publication(self, __data, pubtype: PublicationSource)->Publication: def _get_authorlist(self, authorinfo): authorlist = list() - text = authorinfo.split(' - ')[0] - for i in text.split(','): + text = authorinfo.split(" - ")[0] + for i in text.split(","): i = i.strip() - if bool(re.search(r'\d', i)): + if bool(re.search(r"\d", i)): continue - if ("Proceedings" in i or "Conference" in i or "Journal" in i or - "(" in i or ")" in i or "[" in i or "]" in i or - "Transactions" in i): + if ( + "Proceedings" in i + or "Conference" in i + or "Journal" in i + or "(" in i + or ")" in i + or "[" in i + or "]" in i + or "Transactions" in i + ): continue i = i.replace("…", "") authorlist.append(i) return authorlist - def _get_author_id_list(self, authorinfo_inner_html): author_id_list = list() - html = authorinfo_inner_html.split(' - ')[0] - for author_html in html.split(','): + html = authorinfo_inner_html.split(" - ")[0] + for author_html in html.split(","): author_html = author_html.strip() - match = re.search('\\?user=(.*?)&', author_html) + match = re.search("\\?user=(.*?)&", author_html) if match: author_id_list.append(match.groups()[0]) else: @@ -184,29 +199,29 @@ def _get_author_id_list(self, authorinfo_inner_html): return author_id_list def _scholar_pub(self, __data, publication: Publication): - databox = __data.find('div', class_='gs_ri') - title = databox.find('h3', class_='gs_rt') + databox = __data.find("div", class_="gs_ri") + title = databox.find("h3", class_="gs_rt") - cid = __data.get('data-cid') - pos = __data.get('data-rp') + cid = __data.get("data-cid") + pos = __data.get("data-rp") - publication['gsrank'] = int(pos) + 1 + publication["gsrank"] = int(pos) + 1 - if title.find('span', class_='gs_ctu'): # A citation + if title.find("span", class_="gs_ctu"): # A citation title.span.extract() - elif title.find('span', class_='gs_ctc'): # A book or PDF + elif title.find("span", class_="gs_ctc"): # A book or PDF title.span.extract() - publication['bib']['title'] = title.text.strip() + 
publication["bib"]["title"] = title.text.strip() - if title.find('a'): - publication['pub_url'] = title.find('a')['href'] + if title.find("a"): + publication["pub_url"] = title.find("a")["href"] - author_div_element = databox.find('div', class_='gs_a') + author_div_element = databox.find("div", class_="gs_a") authorinfo = author_div_element.text - authorinfo = authorinfo.replace(u'\xa0', u' ') # NBSP - authorinfo = authorinfo.replace(u'&', u'&') # Ampersand - publication['bib']["author"] = self._get_authorlist(authorinfo) + authorinfo = authorinfo.replace("\xa0", " ") # NBSP + authorinfo = authorinfo.replace("&", "&") # Ampersand + publication["bib"]["author"] = self._get_authorlist(authorinfo) authorinfo_html = author_div_element.decode_contents() publication["author_id"] = self._get_author_id_list(authorinfo_html) @@ -219,163 +234,189 @@ def _scholar_pub(self, __data, publication: Publication): # the middle venue/year part. In principle the venue is separated # from the year by a comma. However, there exist venues with commas # and as shown above there might not always be a venue AND a year... - venueyear = authorinfo.split(' - ') + venueyear = authorinfo.split(" - ") # If there is no middle part (A) then venue and year are unknown. if len(venueyear) <= 2: - publication['bib']['venue'], publication['bib']['pub_year'] = 'NA', 'NA' + publication["bib"]["venue"], publication["bib"]["pub_year"] = "NA", "NA" else: - venueyear = venueyear[1].split(',') - venue = 'NA' + venueyear = venueyear[1].split(",") + venue = "NA" year = venueyear[-1].strip() if year.isnumeric() and len(year) == 4: - publication['bib']['pub_year'] = year + publication["bib"]["pub_year"] = year if len(venueyear) >= 2: - venue = ','.join(venueyear[0:-1]) # everything but last + venue = ",".join(venueyear[0:-1]) # everything but last else: - venue = ','.join(venueyear) # everything - publication['bib']['pub_year'] = 'NA' - publication['bib']['venue'] = venue - - if databox.find('div', class_='gs_rs'): - publication['bib']['abstract'] = databox.find('div', class_='gs_rs').text - publication['bib']['abstract'] = publication['bib']['abstract'].replace(u'\u2026', u'') - publication['bib']['abstract'] = publication['bib']['abstract'].replace(u'\n', u' ') - publication['bib']['abstract'] = publication['bib']['abstract'].strip() - - if publication['bib']['abstract'][0:8].lower() == 'abstract': - publication['bib']['abstract'] = publication['bib']['abstract'][9:].strip() - - publication['url_scholarbib'] = _BIBCITE.format(cid, pos) + venue = ",".join(venueyear) # everything + publication["bib"]["pub_year"] = "NA" + publication["bib"]["venue"] = venue + + if databox.find("div", class_="gs_rs"): + publication["bib"]["abstract"] = databox.find("div", class_="gs_rs").text + publication["bib"]["abstract"] = publication["bib"]["abstract"].replace( + "\u2026", "" + ) + publication["bib"]["abstract"] = publication["bib"]["abstract"].replace( + "\n", " " + ) + publication["bib"]["abstract"] = publication["bib"]["abstract"].strip() + + if publication["bib"]["abstract"][0:8].lower() == "abstract": + publication["bib"]["abstract"] = publication["bib"]["abstract"][ + 9: + ].strip() + + publication["url_scholarbib"] = _BIBCITE.format(cid, pos) sclib = self.nav.publib.format(id=cid) - publication['url_add_sclib'] = sclib + publication["url_add_sclib"] = sclib - lowerlinks = databox.find('div', class_='gs_fl').find_all('a') + lowerlinks = databox.find("div", class_="gs_fl").find_all("a") publication["num_citations"] = 0 for link in lowerlinks: - if 
'Cited by' in link.text: - publication['num_citations'] = int(re.findall(r'\d+', link.text)[0].strip()) - publication['citedby_url'] = link['href'] - - if 'Related articles' in link.text: - publication['url_related_articles'] = link['href'] - - if __data.find('div', class_='gs_ggs gs_fl'): - publication['eprint_url'] = __data.find( - 'div', class_='gs_ggs gs_fl').a['href'] + if "Cited by" in link.text: + publication["num_citations"] = int( + re.findall(r"\d+", link.text)[0].strip() + ) + publication["citedby_url"] = link["href"] + + if "Related articles" in link.text: + publication["url_related_articles"] = link["href"] + + if __data.find("div", class_="gs_ggs gs_fl"): + publication["eprint_url"] = __data.find("div", class_="gs_ggs gs_fl").a[ + "href" + ] return publication - - def fill(self, publication: Publication)->Publication: - """Populate the Publication with information from its profile + def fill(self, publication: Publication) -> Publication: + """Populate the Publication with information from its profile. :param publication: Scholar or Citation publication container object that is not filled :type publication: PublicationCitation or PublicationScholar """ - if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: - url = _CITATIONPUB.format(publication['author_pub_id']) + if publication["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY: + url = _CITATIONPUB.format(publication["author_pub_id"]) soup = self.nav._get_soup(url) - publication['bib']['title'] = soup.find('div', id='gsc_oci_title').text - if publication['bib']['title'][-1] == '\u2026': - merged_snippet = soup.find('div', class_='gsc_oci_merged_snippet') + publication["bib"]["title"] = soup.find("div", id="gsc_oci_title").text + if publication["bib"]["title"][-1] == "\u2026": + merged_snippet = soup.find("div", class_="gsc_oci_merged_snippet") if merged_snippet: - title_div = merged_snippet.find('div') + title_div = merged_snippet.find("div") if title_div: - publication['bib']['title'] = title_div.text - if soup.find('a', class_='gsc_oci_title_link'): - publication['pub_url'] = soup.find( - 'a', class_='gsc_oci_title_link')['href'] - for item in soup.find_all('div', class_='gs_scl'): - key = item.find(class_='gsc_oci_field').text.strip().lower() - val = item.find(class_='gsc_oci_value') - if key == 'authors' or key == 'inventors': - publication['bib']['author'] = ' and '.join( - [i.strip() for i in val.text.split(',')]) - elif key == 'journal': - publication['bib']['journal'] = val.text - elif key == 'conference': - publication['bib']['conference'] = val.text - elif key == 'volume': - publication['bib']['volume'] = val.text - elif key == 'issue': - publication['bib']['number'] = val.text - elif key == 'pages': - publication['bib']['pages'] = val.text - elif key == 'publisher': - publication['bib']['publisher'] = val.text - elif key == 'publication date': - - patterns = ['YYYY/M', - 'YYYY/MM/DD', - 'YYYY', - 'YYYY/M/DD', - 'YYYY/M/D', - 'YYYY/MM/D'] - publication['bib']['pub_year'] = arrow.get(val.text, patterns).year - elif key == 'description': + publication["bib"]["title"] = title_div.text + if soup.find("a", class_="gsc_oci_title_link"): + publication["pub_url"] = soup.find("a", class_="gsc_oci_title_link")[ + "href" + ] + for item in soup.find_all("div", class_="gs_scl"): + key = item.find(class_="gsc_oci_field").text.strip().lower() + val = item.find(class_="gsc_oci_value") + if key == "authors" or key == "inventors": + publication["bib"]["author"] = " and ".join( + [i.strip() for i in 
val.text.split(",")] + ) + elif key == "journal": + publication["bib"]["journal"] = val.text + elif key == "conference": + publication["bib"]["conference"] = val.text + elif key == "volume": + publication["bib"]["volume"] = val.text + elif key == "issue": + publication["bib"]["number"] = val.text + elif key == "pages": + publication["bib"]["pages"] = val.text + elif key == "publisher": + publication["bib"]["publisher"] = val.text + elif key == "publication date": + + patterns = [ + "YYYY/M", + "YYYY/MM/DD", + "YYYY", + "YYYY/M/DD", + "YYYY/M/D", + "YYYY/MM/D", + ] + publication["bib"]["pub_year"] = arrow.get(val.text, patterns).year + elif key == "description": # try to find all the gsh_csp if they exist - abstract = val.find_all(class_='gsh_csp') + abstract = val.find_all(class_="gsh_csp") result = "" # append all gsh_csp together as there can be multiple in certain scenarios for item in abstract: - if item.text[0:8].lower() == 'abstract': + if item.text[0:8].lower() == "abstract": result += item.text[9:].strip() else: result += item.text if len(abstract) == 0: # if no gsh_csp were found - abstract = val.find(class_='gsh_small') + abstract = val.find(class_="gsh_small") if abstract: - if abstract.text[0:8].lower() == 'abstract': + if abstract.text[0:8].lower() == "abstract": result = abstract.text[9:].strip() else: result = abstract.text else: - result = ' '.join([description_part for description_part in val]) - - publication['bib']['abstract'] = result - elif key == 'total citations': - publication['cites_id'] = re.findall( - _SCHOLARPUBRE, val.a['href'])[0].split(',') - publication['citedby_url'] = _CITEDBYLINK.format(','.join(publication['cites_id'])) - elif key == 'scholar articles': - for entry in val.find_all('a'): - if entry.text.lower() == 'related articles': - publication['url_related_articles'] = entry.get('href')[26:] + result = " ".join( + [description_part for description_part in val] + ) + + publication["bib"]["abstract"] = result + elif key == "total citations": + publication["cites_id"] = re.findall(_SCHOLARPUBRE, val.a["href"])[ + 0 + ].split(",") + publication["citedby_url"] = _CITEDBYLINK.format( + ",".join(publication["cites_id"]) + ) + elif key == "scholar articles": + for entry in val.find_all("a"): + if entry.text.lower() == "related articles": + publication["url_related_articles"] = entry.get("href")[26:] # number of citation per year - years = [int(y.text) for y in soup.find_all(class_='gsc_oci_g_t')] - cites = [int(c.text) for c in soup.find_all(class_='gsc_oci_g_al')] - cites_year = [int(c.get('href')[-4:]) for c in soup.find_all(class_='gsc_oci_g_a')] + years = [int(y.text) for y in soup.find_all(class_="gsc_oci_g_t")] + cites = [int(c.text) for c in soup.find_all(class_="gsc_oci_g_al")] + cites_year = [ + int(c.get("href")[-4:]) for c in soup.find_all(class_="gsc_oci_g_a") + ] nonzero_cites_per_year = dict(zip(cites_year, cites)) res_dict = {} for year in years: - res_dict[year] = (nonzero_cites_per_year[year] if year in nonzero_cites_per_year else 0) - publication['cites_per_year'] = res_dict - - if soup.find('div', class_='gsc_vcd_title_ggi'): - publication['eprint_url'] = soup.find( - 'div', class_='gsc_vcd_title_ggi').a['href'] - - if publication.get('public_access', None): - publication['mandates'] = [] + res_dict[year] = ( + nonzero_cites_per_year[year] + if year in nonzero_cites_per_year + else 0 + ) + publication["cites_per_year"] = res_dict + + if soup.find("div", class_="gsc_vcd_title_ggi"): + publication["eprint_url"] = soup.find( + "div", 
class_="gsc_vcd_title_ggi" + ).a["href"] + + if publication.get("public_access", None): + publication["mandates"] = [] self._fill_public_access_mandates(publication) - publication['filled'] = True - elif publication['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: - bibtex_url = self._get_bibtex(publication['url_scholarbib']) + publication["filled"] = True + elif publication["source"] == PublicationSource.PUBLICATION_SEARCH_SNIPPET: + bibtex_url = self._get_bibtex(publication["url_scholarbib"]) bibtex = self.nav._get_page(bibtex_url) parser = bibtexparser.bparser.BibTexParser(common_strings=True) - parsed_bib = remap_bib(bibtexparser.loads(bibtex,parser).entries[-1], _BIB_MAPPING, _BIB_DATATYPES) - publication['bib'].update(parsed_bib) - publication['filled'] = True + parsed_bib = remap_bib( + bibtexparser.loads(bibtex, parser).entries[-1], + _BIB_MAPPING, + _BIB_DATATYPES, + ) + publication["bib"].update(parsed_bib) + publication["filled"] = True return publication - def citedby(self, publication: Publication) -> _SearchScholarIterator or list: - """Searches Google Scholar for other articles that cite this Publication and - returns a Publication generator. + """Searches Google Scholar for other articles that cite this Publication and returns a Publication generator. :param publication: Scholar or Citation publication container object :type publication: Publication @@ -383,12 +424,12 @@ def citedby(self, publication: Publication) -> _SearchScholarIterator or list: :getter: Returns a Generator of Publications that cited the current. :type: Iterator[:class:`Publication`] """ - if not publication['filled']: + if not publication["filled"]: publication = self.fill(publication) - return _SearchScholarIterator(self.nav, publication['citedby_url']) + return _SearchScholarIterator(self.nav, publication["citedby_url"]) def bibtex(self, publication: Publication) -> str: - """Returns the publication as a Bibtex entry + """Returns the publication as a Bibtex entry. 
:param publication: Scholar or Citation publication container object :type publication: Publication @@ -396,10 +437,10 @@ def bibtex(self, publication: Publication) -> str: :getter: Returns a Bibtex entry in text format :type: str """ - if not publication['filled']: + if not publication["filled"]: publication = self.fill(publication) a = BibDatabase() - converted_dict = publication['bib'] + converted_dict = publication["bib"] converted_dict = remap_bib(converted_dict, _BIB_REVERSE_MAPPING) str_dict = {key: str(value) for key, value in converted_dict.items()} # convert every key of the dictionary to string to be Bibtex compatible @@ -407,40 +448,50 @@ def bibtex(self, publication: Publication) -> str: return bibtexparser.dumps(a) def _get_bibtex(self, bib_url) -> str: - """Retrieves the bibtex url""" - + """Retrieves the bibtex url.""" soup = self.nav._get_soup(bib_url) - styles = soup.find_all('a', class_='gs_citi') + styles = soup.find_all("a", class_="gs_citi") for link in styles: if link.string.lower() == "bibtex": - return link.get('href') - return '' + return link.get("href") + return "" def _fill_public_access_mandates(self, publication: Publication) -> None: - """Fills the public access mandates""" - if publication.get('public_access', None): - soup = self.nav._get_soup(_MANDATES_URL.format(publication['author_pub_id'])) - mandates = soup.find_all('li') + """Fills the public access mandates.""" + if publication.get("public_access", None): + soup = self.nav._get_soup( + _MANDATES_URL.format(publication["author_pub_id"]) + ) + mandates = soup.find_all("li") for mandate in mandates: m = Mandate() - m['agency'] = mandate.find('span', class_='gsc_md_mndt_name').text - m['url_policy'] = mandate.find('div', class_='gsc_md_mndt_title').a['href'] - m['url_policy_cached'] = mandate.find('span', class_='gs_a').a['href'] - for desc in mandate.find_all('div', class_='gsc_md_mndt_desc'): + m["agency"] = mandate.find("span", class_="gsc_md_mndt_name").text + m["url_policy"] = mandate.find("div", class_="gsc_md_mndt_title").a[ + "href" + ] + m["url_policy_cached"] = mandate.find("span", class_="gs_a").a["href"] + for desc in mandate.find_all("div", class_="gsc_md_mndt_desc"): match = re.search("Effective date: [0-9]{4}/[0-9]{1,2}", desc.text) if match: - m['effective_date'] = re.sub(pattern="Effective date: ", repl="", - string=desc.text[match.start() : match.end()]) + m["effective_date"] = re.sub( + pattern="Effective date: ", + repl="", + string=desc.text[match.start() : match.end()], + ) match = re.search("Embargo: ", desc.text) if match: - m['embargo'] = re.sub(pattern="Embargo: ", repl="", string=desc.text[match.end():]) + m["embargo"] = re.sub( + pattern="Embargo: ", + repl="", + string=desc.text[match.end() :], + ) if "Grant: " in desc.text: - m['grant'] = desc.text.split("Grant: ")[1] + m["grant"] = desc.text.split("Grant: ")[1] if "Funding acknowledgment" in desc.text: - m['acknowledgement'] = desc.find('span', class_='gs_gray').text + m["acknowledgement"] = desc.find("span", class_="gs_gray").text - publication['mandates'].append(m) + publication["mandates"].append(m) diff --git a/setup.py b/setup.py index de9012ce..0b951515 100644 --- a/setup.py +++ b/setup.py @@ -1,43 +1,44 @@ import setuptools -with open("README.md", "r") as fh: +with open("README.md") as fh: long_description = fh.read() setuptools.setup( - name='scholarly', - version='1.7.2', - author='Steven A. 
Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi', - author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu', - description='Simple access to Google Scholar authors and citations', + name="scholarly", + version="1.7.2", + author="Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi", + author_email="steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu", + description="Simple access to Google Scholar authors and citations", long_description=long_description, long_description_content_type="text/markdown", - license='Unlicense', - - url='https://github.com/scholarly-python-package/scholarly', + license="Unlicense", + url="https://github.com/scholarly-python-package/scholarly", packages=setuptools.find_packages(), - keywords=['Google Scholar', 'academics', 'citations'], + keywords=["Google Scholar", "academics", "citations"], classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'Natural Language :: English', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Topic :: Software Development :: Libraries :: Python Modules'], - install_requires=['arrow', - 'beautifulsoup4', - 'bibtexparser', - 'deprecated', - 'fake_useragent', - 'free-proxy', - 'python-dotenv', - 'requests[socks]', - 'selenium', - 'sphinx_rtd_theme', - 'typing_extensions' - ], + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + ], + install_requires=[ + "arrow", + "beautifulsoup4", + "bibtexparser", + "deprecated", + "fake_useragent", + "free-proxy", + "python-dotenv", + "requests[socks]", + "selenium", + "sphinx_rtd_theme", + "typing_extensions", + ], extras_require={ - 'tor': ['stem'], + "tor": ["stem"], }, - test_suite="test_module.py" + test_suite="test_module.py", ) diff --git a/test_module.py b/test_module.py index a626edda..9972e42d 100644 --- a/test_module.py +++ b/test_module.py @@ -1,15 +1,18 @@ -import unittest +import csv +import json import os -import sys -from scholarly import scholarly, ProxyGenerator -from scholarly.data_types import Mandate -from scholarly.publication_parser import PublicationParser import random -import json -import csv +import sys +import unittest +from contextlib import contextmanager + import requests from bs4 import BeautifulSoup -from contextlib import contextmanager + +from scholarly import ProxyGenerator, scholarly +from scholarly.data_types import Mandate +from scholarly.publication_parser import PublicationParser + try: import pandas as pd except ImportError: @@ -25,15 +28,17 @@ def test_luminati(self): Test that we can set up Luminati (Bright Data) successfully """ proxy_generator = ProxyGenerator() - success = proxy_generator.Luminati(usr=os.getenv("USERNAME"), - passwd=os.getenv("PASSWORD"), - proxy_port=os.getenv("PORT")) + success = proxy_generator.Luminati( + usr=os.getenv("USERNAME"), + passwd=os.getenv("PASSWORD"), + proxy_port=os.getenv("PORT"), + ) self.assertTrue(success) self.assertEqual(proxy_generator.proxy_mode, "LUMINATI") class TestScraperAPI(unittest.TestCase): - skipUnless = os.getenv('SCRAPER_API_KEY') + skipUnless = os.getenv("SCRAPER_API_KEY") @unittest.skipUnless(skipUnless, reason="No 
ScraperAPI key found") def test_scraperapi(self): @@ -41,25 +46,30 @@ def test_scraperapi(self): Test that we can set up ScraperAPI successfully """ proxy_generator = ProxyGenerator() - success = proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY')) + success = proxy_generator.ScraperAPI(os.getenv("SCRAPER_API_KEY")) self.assertTrue(success) self.assertEqual(proxy_generator.proxy_mode, "SCRAPERAPI") class TestTorInternal(unittest.TestCase): - skipUnless = [_bin for path in sys.path if os.path.isdir(path) for _bin in os.listdir(path) - if _bin in ('tor', 'tor.exe')] - - @unittest.skipUnless(skipUnless, reason='Tor executable not found') + skipUnless = [ + _bin + for path in sys.path + if os.path.isdir(path) + for _bin in os.listdir(path) + if _bin in ("tor", "tor.exe") + ] + + @unittest.skipUnless(skipUnless, reason="Tor executable not found") def test_tor_launch_own_process(self): """ Test that we can launch a Tor process """ proxy_generator = ProxyGenerator() if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): - tor_cmd = 'tor' + tor_cmd = "tor" elif sys.platform.startswith("win"): - tor_cmd = 'tor.exe' + tor_cmd = "tor.exe" else: tor_cmd = None @@ -72,14 +82,13 @@ def test_tor_launch_own_process(self): self.assertEqual(result["tor_control_port"], tor_control_port) self.assertEqual(result["tor_sock_port"], tor_sock_port) # Check that we can issue a query as well - query = 'Ipeirotis' + query = "Ipeirotis" scholarly.use_proxy(proxy_generator) authors = [a for a in scholarly.search_author(query)] self.assertGreaterEqual(len(authors), 1) class TestScholarly(unittest.TestCase): - @classmethod def setUpClass(cls): """ @@ -113,23 +122,24 @@ def setUpClass(cls): else: tor_sock_port = None tor_control_port = None - proxy_generator.Tor_External(tor_sock_port, tor_control_port, - tor_password) + proxy_generator.Tor_External(tor_sock_port, tor_control_port, tor_password) elif cls.connection_method == "tor_internal": if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): - tor_cmd = 'tor' + tor_cmd = "tor" elif sys.platform.startswith("win"): - tor_cmd = 'tor.exe' + tor_cmd = "tor.exe" else: tor_cmd = None - proxy_generator.Tor_Internal(tor_cmd = tor_cmd) + proxy_generator.Tor_Internal(tor_cmd=tor_cmd) elif cls.connection_method == "luminati": scholarly.set_retries(10) - proxy_generator.Luminati(usr=os.getenv("USERNAME"), - passwd=os.getenv("PASSWORD"), - proxy_port=os.getenv("PORT")) + proxy_generator.Luminati( + usr=os.getenv("USERNAME"), + passwd=os.getenv("PASSWORD"), + proxy_port=os.getenv("PORT"), + ) elif cls.connection_method == "freeproxy": # Use different instances for primary and secondary @@ -137,7 +147,7 @@ def setUpClass(cls): proxy_generator.FreeProxies() elif cls.connection_method == "scraperapi": - proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY')) + proxy_generator.ScraperAPI(os.getenv("SCRAPER_API_KEY")) else: scholarly.use_proxy(None) @@ -149,7 +159,9 @@ def setUpClass(cls): # that does not exist yet, so we can safely delete it. 
cls.mandates_filename = "scholarly.csv" while os.path.exists(cls.mandates_filename): - cls.mandates_filename = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + ".csv" + cls.mandates_filename = ( + "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=10)) + ".csv" + ) @classmethod def tearDownClass(cls): @@ -174,10 +186,13 @@ def test_search_author_empty_author(self): """ Test that sholarly.search_author('') returns no authors """ - authors = [a for a in scholarly.search_author('')] + authors = [a for a in scholarly.search_author("")] self.assertIs(len(authors), 0) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_keyword_empty_keyword(self): """ As of 2020-04-30, there are 6 individuals that match the name 'label' @@ -185,18 +200,24 @@ def test_search_keyword_empty_keyword(self): # TODO this seems like undesirable functionality for # scholarly.search_keyword() with empty string. Surely, no authors # should be returned. Consider modifying the method itself. - authors = [a for a in scholarly.search_keyword('')] + authors = [a for a in scholarly.search_keyword("")] self.assertGreaterEqual(len(authors), 6) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_pubs_empty_publication(self): """ Test that searching for an empty publication returns zero results """ - pubs = [p for p in scholarly.search_pubs('')] + pubs = [p for p in scholarly.search_pubs("")] self.assertIs(len(pubs), 0) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_pubs_citedby(self): """ Testing that when we retrieve the list of publications that cite @@ -206,14 +227,17 @@ def test_search_pubs_citedby(self): The 'Machine-learned epidemiology' paper had 11 citations as of June 1, 2020. """ - query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale' + query = "Machine-learned epidemiology: real-time detection of foodborne illness at scale" pubs = [p for p in scholarly.search_pubs(query)] self.assertGreaterEqual(len(pubs), 1) filled = scholarly.fill(pubs[0]) cites = [c for c in scholarly.citedby(filled)] - self.assertEqual(len(cites), filled['num_citations']) + self.assertEqual(len(cites), filled["num_citations"]) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_pubs_citedby_id(self): """ Test querying for citations by paper ID. @@ -233,11 +257,11 @@ def test_bibtex(self): Test that we get the BiBTeX entry correctly """ - expected_result = \ - ("""@inproceedings{ester1996density, + expected_result = ( + """@inproceedings{ester1996density, abstract = {Clustering algorithms are attractive for the task of class identification in spatial databases. 
""" - """However, the application to large spatial databases rises the following requirements for clustering algorithms: """ - """minimal requirements of domain knowledge to determine the input}, + """However, the application to large spatial databases rises the following requirements for clustering algorithms: """ + """minimal requirements of domain knowledge to determine the input}, author = {Ester, Martin and Kriegel, Hans-Peter and Sander, J{\\"o}rg and Xu, Xiaowei and others}, booktitle = {kdd}, number = {34}, @@ -250,12 +274,18 @@ def test_bibtex(self): """ ) - pub = scholarly.search_single_pub("A density-based algorithm for discovering clusters in large " - "spatial databases with noise", filled=True) + pub = scholarly.search_single_pub( + "A density-based algorithm for discovering clusters in large " + "spatial databases with noise", + filled=True, + ) result = scholarly.bibtex(pub) self.assertEqual(result, expected_result.replace("\n ", "\n")) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_keyword(self): """ Test that we can search based on specific keywords @@ -266,111 +296,135 @@ def test_search_keyword(self): should be listed first. """ # Example 1 - authors = [a['name'] for a in scholarly.search_keyword('3d_shape')] + authors = [a["name"] for a in scholarly.search_keyword("3d_shape")] self.assertIsNot(len(authors), 0) - self.assertIn(u'Steven A. Cholewiak, PhD', authors) + self.assertIn("Steven A. Cholewiak, PhD", authors) # Example 2 - expected_author = {'affiliation': 'Stanford University', - 'citedby': 43856, - 'email_domain': '@cs.stanford.edu', - 'filled': [], - 'interests': ['Robotics', - 'Haptics', - 'Human Motion Understanding'], - 'name': 'Oussama Khatib', - 'scholar_id': '4arkOLcAAAAJ', - 'source': 'SEARCH_AUTHOR_SNIPPETS', - 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4arkOLcAAAAJ' - } - search_query = scholarly.search_keyword('Haptics') + expected_author = { + "affiliation": "Stanford University", + "citedby": 43856, + "email_domain": "@cs.stanford.edu", + "filled": [], + "interests": ["Robotics", "Haptics", "Human Motion Understanding"], + "name": "Oussama Khatib", + "scholar_id": "4arkOLcAAAAJ", + "source": "SEARCH_AUTHOR_SNIPPETS", + "url_picture": "https://scholar.google.com/citations?view_op=medium_photo&user=4arkOLcAAAAJ", + } + search_query = scholarly.search_keyword("Haptics") author = next(search_query) for key in author: - if (key not in {"citedby", "container_type", "interests"}) and (key in expected_author): + if (key not in {"citedby", "container_type", "interests"}) and ( + key in expected_author + ): self.assertEqual(author[key], expected_author[key]) self.assertEqual(set(author["interests"]), set(expected_author["interests"])) def test_search_keywords(self): - query = scholarly.search_keywords(['crowdsourcing', 'privacy']) + query = scholarly.search_keywords(["crowdsourcing", "privacy"]) author = next(query) - self.assertEqual(author['scholar_id'], '_cMw1IUAAAAJ') - self.assertEqual(author['name'], 'Arpita Ghosh') - self.assertEqual(author['affiliation'], 'Cornell University') + self.assertEqual(author["scholar_id"], "_cMw1IUAAAAJ") + self.assertEqual(author["name"], "Arpita Ghosh") + self.assertEqual(author["affiliation"], "Cornell University") def test_search_author_single_author(self): - query = 'Steven A. 
Cholewiak' + query = "Steven A. Cholewiak" authors = [a for a in scholarly.search_author(query)] self.assertGreaterEqual(len(authors), 1) author = scholarly.fill(authors[0]) - self.assertEqual(author['name'], u'Steven A. Cholewiak, PhD') - self.assertEqual(author['scholar_id'], u'4bahYMkAAAAJ') - - self.assertEqual(author['homepage'], "http://steven.cholewiak.com/") - self.assertEqual(author['organization'], 6518679690484165796) - self.assertGreaterEqual(author['public_access']['available'], 10) - self.assertEqual(author['public_access']['available'], - sum(pub.get('public_access', None) is True for pub in author['publications'])) - self.assertEqual(author['public_access']['not_available'], - sum(pub.get('public_access', None) is False for pub in author['publications'])) - pub = author['publications'][2] - self.assertEqual(pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC') - self.assertTrue('5738786554683183717' in pub['cites_id']) + self.assertEqual(author["name"], "Steven A. Cholewiak, PhD") + self.assertEqual(author["scholar_id"], "4bahYMkAAAAJ") + + self.assertEqual(author["homepage"], "http://steven.cholewiak.com/") + self.assertEqual(author["organization"], 6518679690484165796) + self.assertGreaterEqual(author["public_access"]["available"], 10) + self.assertEqual( + author["public_access"]["available"], + sum( + pub.get("public_access", None) is True for pub in author["publications"] + ), + ) + self.assertEqual( + author["public_access"]["not_available"], + sum( + pub.get("public_access", None) is False + for pub in author["publications"] + ), + ) + pub = author["publications"][2] + self.assertEqual(pub["author_pub_id"], "4bahYMkAAAAJ:LI9QrySNdTsC") + self.assertTrue("5738786554683183717" in pub["cites_id"]) scholarly.fill(pub) - mandate = Mandate(agency="US National Science Foundation", effective_date="2016/1", embargo="12 months", - url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - url_policy_cached="/mandates/nsf-2021-02-13.pdf", - acknowledgement=" …NSF grant BCS-1354029 …") - self.assertIn(mandate, pub['mandates']) + mandate = Mandate( + agency="US National Science Foundation", + effective_date="2016/1", + embargo="12 months", + url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", + url_policy_cached="/mandates/nsf-2021-02-13.pdf", + acknowledgement=" …NSF grant BCS-1354029 …", + ) + self.assertIn(mandate, pub["mandates"]) # Trigger the pprint method, but suppress the output with self.suppress_stdout(): scholarly.pprint(author) scholarly.pprint(pub) # Check for the complete list of coauthors - self.assertGreaterEqual(len(author['coauthors']), 20) - if len(author['coauthors']) > 20: - self.assertGreaterEqual(len(author['coauthors']), 36) - self.assertTrue('I23YUh8AAAAJ' in [_coauth['scholar_id'] for _coauth in author['coauthors']]) + self.assertGreaterEqual(len(author["coauthors"]), 20) + if len(author["coauthors"]) > 20: + self.assertGreaterEqual(len(author["coauthors"]), 36) + self.assertTrue( + "I23YUh8AAAAJ" + in [_coauth["scholar_id"] for _coauth in author["coauthors"]] + ) def test_search_author_multiple_authors(self): """ As of May 12, 2020 there are at least 24 'Cattanis's listed as authors and Giordano Cattani is one of them """ - authors = [a['name'] for a in scholarly.search_author('cattani')] + authors = [a["name"] for a in scholarly.search_author("cattani")] self.assertGreaterEqual(len(authors), 24) - self.assertIn(u'Giordano Cattani', authors) + self.assertIn("Giordano Cattani", authors) def test_search_author_id(self): """ Test the 
search by author ID. Marie Skłodowska-Curie's ID is
-        EmD_lTEAAAAJ and these IDs are permenant
+        EmD_lTEAAAAJ and these IDs are permanent
         """
-        author = scholarly.search_author_id('EmD_lTEAAAAJ')
-        self.assertEqual(author['name'], u'Marie Skłodowska-Curie')
-        self.assertEqual(author['affiliation'],
-                         u'Institut du radium, University of Paris')
+        author = scholarly.search_author_id("EmD_lTEAAAAJ")
+        self.assertEqual(author["name"], "Marie Skłodowska-Curie")
+        self.assertEqual(
+            author["affiliation"], "Institut du radium, University of Paris"
+        )
 
     def test_search_author_id_filled(self):
         """
         Test the search by author ID. Marie Skłodowska-Curie's ID is
-        EmD_lTEAAAAJ and these IDs are permenant.
+        EmD_lTEAAAAJ and these IDs are permanent.
         As of July 2020, Marie Skłodowska-Curie has 1963 citations
         on Google Scholar and 179 publications
         """
-        author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True)
-        self.assertEqual(author['name'], u'Marie Skłodowska-Curie')
-        self.assertEqual(author['affiliation'],
-                         u'Institut du radium, University of Paris')
-        self.assertEqual(author['interests'], [])
-        self.assertEqual(author['public_access']['available'], 1)
-        self.assertEqual(author['public_access']['not_available'], 0)
-        self.assertGreaterEqual(author['citedby'], 1963)  # TODO: maybe change
-        self.assertGreaterEqual(len(author['publications']), 179)
-        pub = author['publications'][1]
-        self.assertEqual(pub["citedby_url"],
-                         "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702")
-
-    @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup")
+        author = scholarly.search_author_id("EmD_lTEAAAAJ", filled=True)
+        self.assertEqual(author["name"], "Marie Skłodowska-Curie")
+        self.assertEqual(
+            author["affiliation"], "Institut du radium, University of Paris"
+        )
+        self.assertEqual(author["interests"], [])
+        self.assertEqual(author["public_access"]["available"], 1)
+        self.assertEqual(author["public_access"]["not_available"], 0)
+        self.assertGreaterEqual(author["citedby"], 1963)  # TODO: maybe change
+        self.assertGreaterEqual(len(author["publications"]), 179)
+        pub = author["publications"][1]
+        self.assertEqual(
+            pub["citedby_url"],
+            "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702",
+        )
+
+    @unittest.skipIf(
+        os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"},
+        reason="No robust proxy setup",
+    )
    def test_search_pubs(self):
        """
        As of May 12, 2020 there are at least 29 pubs that fit the search term:
@@ -383,15 +437,21 @@ def test_search_pubs(self):
         pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"'))
         # Check that the first entry in pubs is the same as pub.
         # Checking for equality holds for non-dict entries only.
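         # ("bib" is itself a dict, so its entries are compared key by key below.)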
- for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {"author_id", "pub_url", "num_citations"}: self.assertEqual(pub[key], pubs[0][key]) - for key in {'title', 'pub_year', 'venue'}: - self.assertEqual(pub['bib'][key], pubs[0]['bib'][key]) + for key in {"title", "pub_year", "venue"}: + self.assertEqual(pub["bib"][key], pubs[0]["bib"][key]) self.assertGreaterEqual(len(pubs), 27) - titles = [p['bib']['title'] for p in pubs] - self.assertIn('Visual perception of the physical stability of asymmetric three-dimensional objects', titles) + titles = [p["bib"]["title"] for p in pubs] + self.assertIn( + "Visual perception of the physical stability of asymmetric three-dimensional objects", + titles, + ) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_pubs_total_results(self): """ As of September 16, 2021 there are 32 pubs that fit the search term: @@ -403,50 +463,66 @@ def test_search_pubs_total_results(self): pubs = scholarly.search_pubs('"naive physics" stability "3d shape"') self.assertGreaterEqual(pubs.total_results, 32) - pubs = scholarly.search_pubs('WIEN2k Blaha') + pubs = scholarly.search_pubs("WIEN2k Blaha") self.assertGreaterEqual(pubs.total_results, 10000) - pubs = scholarly.search_pubs('sdfsdf+24r+asdfasdf') + pubs = scholarly.search_pubs("sdfsdf+24r+asdfasdf") self.assertEqual(pubs.total_results, 0) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_search_pubs_filling_publication_contents(self): - ''' + """ This process checks the process of filling a publication that is derived from the search publication snippets. 
-        '''
-        query = 'Creating correct blur and its effect on accommodation'
+        """
+        query = "Creating correct blur and its effect on accommodation"
         results = scholarly.search_pubs(query)
         pubs = [p for p in results]
         self.assertGreaterEqual(len(pubs), 1)
         f = scholarly.fill(pubs[0])
-        self.assertTrue(f['bib']['author'] == u'Cholewiak, Steven A and Love, Gordon D and Banks, Martin S')
-        self.assertTrue(f['author_id'] == ['4bahYMkAAAAJ', '3xJXtlwAAAAJ', 'Smr99uEAAAAJ'])
-        self.assertTrue(f['bib']['journal'] == u'Journal of Vision')
-        self.assertTrue(f['bib']['number'] == '9')
-        self.assertTrue(f['bib']['pages'] == u'1--1')
-        self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology')
-        self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation')
-        self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817')
-        self.assertTrue(f['bib']['volume'] == '18')
-        self.assertTrue(f['bib']['pub_year'] == u'2018')
+        self.assertTrue(
+            f["bib"]["author"]
+            == "Cholewiak, Steven A and Love, Gordon D and Banks, Martin S"
+        )
+        self.assertTrue(
+            f["author_id"] == ["4bahYMkAAAAJ", "3xJXtlwAAAAJ", "Smr99uEAAAAJ"]
+        )
+        self.assertTrue(f["bib"]["journal"] == "Journal of Vision")
+        self.assertTrue(f["bib"]["number"] == "9")
+        self.assertTrue(f["bib"]["pages"] == "1--1")
+        self.assertTrue(
+            f["bib"]["publisher"]
+            == "The Association for Research in Vision and Ophthalmology"
+        )
+        self.assertTrue(
+            f["bib"]["title"] == "Creating correct blur and its effect on accommodation"
+        )
+        self.assertTrue(
+            f["pub_url"]
+            == "https://jov.arvojournals.org/article.aspx?articleid=2701817"
+        )
+        self.assertTrue(f["bib"]["volume"] == "18")
+        self.assertTrue(f["bib"]["pub_year"] == "2018")
 
     def test_extract_author_id_list(self):
-        '''
+        """
         This unit test tests the extraction of the author id field from the html
         to populate the `author_id` field in the Publication object.
-        '''
+        """
         author_html_full = '<a href="/citations?user=4bahYMkAAAAJ&amp;hl=en&amp;oi=sra">SA Cholewiak</a>, <a href="/citations?user=3xJXtlwAAAAJ&amp;hl=en&amp;oi=sra">GD Love</a>, <a href="/citations?user=Smr99uEAAAAJ&amp;hl=en&amp;oi=sra">MS Banks</a>&nbsp;- Journal of vision, 2018 - jov.arvojournals.org'
         pub_parser = PublicationParser(None)
         author_id_list = pub_parser._get_author_id_list(author_html_full)
-        self.assertTrue(author_id_list[0] == '4bahYMkAAAAJ')
-        self.assertTrue(author_id_list[1] == '3xJXtlwAAAAJ')
-        self.assertTrue(author_id_list[2] == 'Smr99uEAAAAJ')
+        self.assertTrue(author_id_list[0] == "4bahYMkAAAAJ")
+        self.assertTrue(author_id_list[1] == "3xJXtlwAAAAJ")
+        self.assertTrue(author_id_list[2] == "Smr99uEAAAAJ")
 
-        author_html_partial = "A Bateman, J O'Connell, N Lorenzini, <a href=\"/citations?user=TEndP-sAAAAJ&amp;hl=en&amp;oi=sra\">T Gardner</a>…&nbsp;- BMC psychiatry, 2016 - Springer"
+        author_html_partial = 'A Bateman, J O\'Connell, N Lorenzini, <a href="/citations?user=TEndP-sAAAAJ&amp;hl=en&amp;oi=sra">T Gardner</a>…&nbsp;- BMC psychiatry, 2016 - Springer'
         pub_parser = PublicationParser(None)
         author_id_list = pub_parser._get_author_id_list(author_html_partial)
-        self.assertTrue(author_id_list[3] == 'TEndP-sAAAAJ')
+        self.assertTrue(author_id_list[3] == "TEndP-sAAAAJ")
 
     def test_serialiazation(self):
         """
@@ -462,18 +538,20 @@ def test_serialiazation(self):
         def cpy_decoder(di):
             """A utility function to convert the keys in `cites_per_year` to `int` type.
 
-            This ensures consistency with `CitesPerYear` typing.
+            This ensures consistency with `CitesPerYear` typing. 
""" if "cites_per_year" in di: - di["cites_per_year"] = {int(k): v for k,v in di["cites_per_year"].items()} + di["cites_per_year"] = { + int(k): v for k, v in di["cites_per_year"].items() + } return di - author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True) + author = scholarly.search_author_id("EmD_lTEAAAAJ", filled=True) serialized = json.dumps(author) author_loaded = json.loads(serialized, object_hook=cpy_decoder) self.assertEqual(author, author_loaded) # Test that a loaded publication is still fillable and serializable. - pub = author_loaded['publications'][0] + pub = author_loaded["publications"][0] scholarly.fill(pub) serialized = json.dumps(pub) pub_loaded = json.loads(serialized, object_hook=cpy_decoder) @@ -486,79 +564,106 @@ def test_full_title(self): publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY so the long title-publication is taken from an author object. """ - author = scholarly.search_author_id('Xxjj6IsAAAAJ') - author = scholarly.fill(author, sections=['publications']) + author = scholarly.search_author_id("Xxjj6IsAAAAJ") + author = scholarly.fill(author, sections=["publications"]) pub_index = -1 - for i in range(len(author['publications'])): - if author['publications'][i]['author_pub_id'] == 'Xxjj6IsAAAAJ:u_35RYKgDlwC': + for i in range(len(author["publications"])): + if ( + author["publications"][i]["author_pub_id"] + == "Xxjj6IsAAAAJ:u_35RYKgDlwC" + ): pub_index = i self.assertGreaterEqual(i, 0) # elided title - self.assertEqual(author['publications'][pub_index]['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + author["publications"][pub_index]["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …", + ) # full text - pub = scholarly.fill(author['publications'][pub_index]) - self.assertEqual(pub['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation') + pub = scholarly.fill(author["publications"][pub_index]) + self.assertEqual( + pub["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation", + ) - self.assertEqual(pub['bib']['citation'], "") + self.assertEqual(pub["bib"]["citation"], "") - for i in range(len(author['publications'])): - if author['publications'][i]['author_pub_id'] == 'Xxjj6IsAAAAJ:ldfaerwXgEUC': + for i in range(len(author["publications"])): + if ( + author["publications"][i]["author_pub_id"] + == "Xxjj6IsAAAAJ:ldfaerwXgEUC" + ): pub_index = i self.assertGreaterEqual(i, 0) # elided title - self.assertEqual(author['publications'][pub_index]['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + author["publications"][pub_index]["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …", + ) # full text - pub = 
scholarly.fill(author['publications'][pub_index]) - self.assertEqual(pub['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + pub = scholarly.fill(author["publications"][pub_index]) + self.assertEqual( + pub["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …", + ) - self.assertEqual(pub['bib']['citation'], "Journal of Fisheries and Life Sciences 5 (2), 74-84, 2020") + self.assertEqual( + pub["bib"]["citation"], + "Journal of Fisheries and Life Sciences 5 (2), 74-84, 2020", + ) def test_author_organization(self): - """ - """ + """ """ organization_id = 4836318610601440500 # Princeton University organizations = scholarly.search_org("Princeton University") self.assertEqual(len(organizations), 1) organization = organizations[0] - self.assertEqual(organization['Organization'], "Princeton University") - self.assertEqual(organization['id'], str(organization_id)) + self.assertEqual(organization["Organization"], "Princeton University") + self.assertEqual(organization["id"], str(organization_id)) search_query = scholarly.search_author_by_organization(organization_id) author = next(search_query) - self.assertEqual(author['scholar_id'], "ImhakoAAAAAJ") - self.assertEqual(author['name'], "Daniel Kahneman") - self.assertEqual(author['email_domain'], "@princeton.edu") - self.assertEqual(author['affiliation'], "Princeton University (Emeritus)") - self.assertGreaterEqual(author['citedby'], 438891) + self.assertEqual(author["scholar_id"], "ImhakoAAAAAJ") + self.assertEqual(author["name"], "Daniel Kahneman") + self.assertEqual(author["email_domain"], "@princeton.edu") + self.assertEqual(author["affiliation"], "Princeton University (Emeritus)") + self.assertGreaterEqual(author["citedby"], 438891) def test_coauthors(self): """ Test that we can fetch long (20+) and short list of coauthors """ - author = scholarly.search_author_id('7Jl3PIoAAAAJ') - scholarly.fill(author, sections=['basics', 'coauthors']) - self.assertEqual(author['name'], "Victor Silva") - self.assertLessEqual(len(author['coauthors']), 20) + author = scholarly.search_author_id("7Jl3PIoAAAAJ") + scholarly.fill(author, sections=["basics", "coauthors"]) + self.assertEqual(author["name"], "Victor Silva") + self.assertLessEqual(len(author["coauthors"]), 20) # If the above assertion fails, pick a different author profile - self.assertGreaterEqual(len(author['coauthors']), 6) - self.assertIn('Eleni Stroulia', [_coauth['name'] for _coauth in author['coauthors']]) - self.assertIn('TyM1dLwAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']]) + self.assertGreaterEqual(len(author["coauthors"]), 6) + self.assertIn( + "Eleni Stroulia", [_coauth["name"] for _coauth in author["coauthors"]] + ) + self.assertIn( + "TyM1dLwAAAAJ", [_coauth["scholar_id"] for _coauth in author["coauthors"]] + ) # Fill co-authors - for _coauth in author['coauthors']: - scholarly.fill(_coauth, sections=['basics']) - self.assertIn(16627554827500071773, [_coauth.get('organization', None) for _coauth in author['coauthors']]) - - author = scholarly.search_author_id('PA9La6oAAAAJ') - scholarly.fill(author, sections=['basics', 'coauthors']) - self.assertEqual(author['name'], "Panos Ipeirotis") - self.assertGreaterEqual(len(author['coauthors']), 66) + for _coauth in author["coauthors"]: + 
scholarly.fill(_coauth, sections=["basics"]) + self.assertIn( + 16627554827500071773, + [_coauth.get("organization", None) for _coauth in author["coauthors"]], + ) + + author = scholarly.search_author_id("PA9La6oAAAAJ") + scholarly.fill(author, sections=["basics", "coauthors"]) + self.assertEqual(author["name"], "Panos Ipeirotis") + self.assertGreaterEqual(len(author["coauthors"]), 66) # Break the build if the long list cannot be fetched. - self.assertIn('Eduardo Ruiz', [_coauth['name'] for _coauth in author['coauthors']]) - self.assertIn('hWq7jFQAAAAJ', [_coauth['scholar_id'] for _coauth in author['coauthors']]) + self.assertIn( + "Eduardo Ruiz", [_coauth["name"] for _coauth in author["coauthors"]] + ) + self.assertIn( + "hWq7jFQAAAAJ", [_coauth["scholar_id"] for _coauth in author["coauthors"]] + ) def test_public_access(self): """ @@ -569,12 +674,21 @@ def test_public_access(self): public access counts without fetching publications. """ author = scholarly.search_author_id("f4KlrXIAAAAJ") - scholarly.fill(author, sections=['basics', 'public_access', 'publications']) + scholarly.fill(author, sections=["basics", "public_access", "publications"]) self.assertGreaterEqual(author["public_access"]["available"], 1150) - self.assertEqual(author["public_access"]["available"], - sum(pub.get("public_access", None) is True for pub in author["publications"])) - self.assertEqual(author["public_access"]["not_available"], - sum(pub.get("public_access", None) is False for pub in author["publications"])) + self.assertEqual( + author["public_access"]["available"], + sum( + pub.get("public_access", None) is True for pub in author["publications"] + ), + ) + self.assertEqual( + author["public_access"]["not_available"], + sum( + pub.get("public_access", None) is False + for pub in author["publications"] + ), + ) author = next(scholarly.search_author("Daniel Kahneman")) self.assertEqual(author["scholar_id"], "ImhakoAAAAAJ") @@ -587,63 +701,80 @@ def test_mandates(self): Test that we can fetch the funding information of a paper from an author """ author = scholarly.search_author_id("kUDCLXAAAAAJ") - scholarly.fill(author, sections=['public_access', 'publications']) - for pub in author['publications']: - if pub['author_pub_id'] == "kUDCLXAAAAAJ:tzM49s52ZIMC": + scholarly.fill(author, sections=["public_access", "publications"]) + for pub in author["publications"]: + if pub["author_pub_id"] == "kUDCLXAAAAAJ:tzM49s52ZIMC": scholarly.fill(pub) break - mandate = Mandate(agency="European Commission", effective_date="2013/12", embargo="6 months", grant="279396", - url_policy="https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf", - url_policy_cached="/mandates/horizon2020_eu-2021-02-13-en.pdf", + mandate = Mandate( + agency="European Commission", + effective_date="2013/12", + embargo="6 months", + grant="279396", + url_policy="https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf", + url_policy_cached="/mandates/horizon2020_eu-2021-02-13-en.pdf", ) - self.assertIn(mandate, pub['mandates']) + self.assertIn(mandate, pub["mandates"]) def test_related_articles_from_author(self): """ Test that we obtain related articles to an article from an author """ author = scholarly.search_author_id("ImhakoAAAAAJ") - scholarly.fill(author, sections=['basics', 'publications']) - pub = author['publications'][0] - self.assertEqual(pub['bib']['title'], 'Prospect theory: An analysis of decision under risk') - 
self.assertEqual(pub['bib']['citation'], 'Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013') + scholarly.fill(author, sections=["basics", "publications"]) + pub = author["publications"][0] + self.assertEqual( + pub["bib"]["title"], "Prospect theory: An analysis of decision under risk" + ) + self.assertEqual( + pub["bib"]["citation"], + "Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013", + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'pub_url', 'num_citations'}: + for key in {"pub_url", "num_citations"}: self.assertEqual(pub[key], same_article[key]) - for key in {'title', 'pub_year'}: - self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) + for key in {"title", "pub_year"}: + self.assertEqual(str(pub["bib"][key]), (same_article["bib"][key])) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Choices, values, and frames') - self.assertEqual(related_article['bib']['pub_year'], '2013') - self.assertGreaterEqual(related_article['num_citations'], 16561) - self.assertIn("A Tversky", related_article['bib']['author']) - - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + self.assertEqual(related_article["bib"]["title"], "Choices, values, and frames") + self.assertEqual(related_article["bib"]["pub_year"], "2013") + self.assertGreaterEqual(related_article["num_citations"], 16561) + self.assertIn("A Tversky", related_article["bib"]["author"]) + + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_related_articles_from_publication(self): """ Test that we obtain related articles to an article from a search """ - pub = scholarly.search_single_pub("Planck 2018 results-VI. Cosmological parameters") + pub = scholarly.search_single_pub( + "Planck 2018 results-VI. 
Cosmological parameters" + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {"author_id", "pub_url", "num_citations"}: self.assertEqual(pub[key], same_article[key]) - for key in {'title', 'pub_year'}: - self.assertEqual(pub['bib'][key], same_article['bib'][key]) + for key in {"title", "pub_year"}: + self.assertEqual(pub["bib"][key], same_article["bib"][key]) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Large Magellanic Cloud Cepheid standards provide ' - 'a 1% foundation for the determination of the Hubble constant and stronger evidence ' - 'for physics beyond ΛCDM') - self.assertEqual(related_article['bib']['pub_year'], '2019') - self.assertGreaterEqual(related_article['num_citations'], 1388) - self.assertIn("AG Riess", related_article['bib']['author']) + self.assertEqual( + related_article["bib"]["title"], + "Large Magellanic Cloud Cepheid standards provide " + "a 1% foundation for the determination of the Hubble constant and stronger evidence " + "for physics beyond ΛCDM", + ) + self.assertEqual(related_article["bib"]["pub_year"], "2019") + self.assertGreaterEqual(related_article["num_citations"], 1388) + self.assertIn("AG Riess", related_article["bib"]["author"]) def test_author_custom_url(self): """ @@ -651,23 +782,37 @@ def test_author_custom_url(self): """ query_url = "/citations?hl=en&view_op=search_authors&mauthors=label%3A3d_shape" authors = scholarly.search_author_custom_url(query_url) - self.assertIn(u'Steven A. Cholewiak, PhD', [author['name'] for author in authors]) + self.assertIn( + "Steven A. Cholewiak, PhD", [author["name"] for author in authors] + ) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_pubs_custom_url(self): """ Test that we can use custom URLs for retrieving publication data """ - query_url = ('/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' - 'as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31') + query_url = ( + '/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' + "as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31" + ) pubs = scholarly.search_pubs_custom_url(query_url) pub = next(pubs) - self.assertEqual(pub['bib']['title'], 'Quantitation and mapping of tissue optical properties using modulated imaging') - self.assertEqual(set(pub['author_id']), {'V-ab9U4AAAAJ', '4k-k6SEAAAAJ', 'GLm-SaQAAAAJ'}) - self.assertEqual(pub['bib']['pub_year'], '2009') - self.assertGreaterEqual(pub['num_citations'], 581) + self.assertEqual( + pub["bib"]["title"], + "Quantitation and mapping of tissue optical properties using modulated imaging", + ) + self.assertEqual( + set(pub["author_id"]), {"V-ab9U4AAAAJ", "4k-k6SEAAAAJ", "GLm-SaQAAAAJ"} + ) + self.assertEqual(pub["bib"]["pub_year"], "2009") + self.assertGreaterEqual(pub["num_citations"], 581) - @unittest.skipIf(sys.platform.startswith("win"), reason="File read is empty in Windows") + @unittest.skipIf( + sys.platform.startswith("win"), reason="File read is empty in Windows" + ) def test_download_mandates_csv(self): """ Test that we can download the mandates CSV and read it. 
@@ -676,40 +821,50 @@ def test_download_mandates_csv(self): text = scholarly.download_mandates_csv(self.mandates_filename) self.assertGreater(len(text), 0) funder, policy, percentage2020, percentageOverall = [], [], [], [] - with open(self.mandates_filename, "r") as f: + with open(self.mandates_filename) as f: csv_reader = csv.DictReader(f) for row in csv_reader: - funder.append(row['\ufeffFunder']) - policy.append(row['Policy']) - percentage2020.append(row['2020']) - percentageOverall.append(row['Overall']) + funder.append(row["\ufeffFunder"]) + policy.append(row["Policy"]) + percentage2020.append(row["2020"]) + percentageOverall.append(row["Overall"]) agency_policy = { "US National Science Foundation": "https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", "Department of Science & Technology, India": "http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf", "Swedish Research Council": "https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "" + "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "", } agency_2020 = { "US National Science Foundation": "87%", "Department of Science & Technology, India": "49%", "Swedish Research Council": "89%", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "88%" + "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning": "88%", } - response = requests.get("https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en") + response = requests.get( + "https://scholar.google.com/citations?view_op=mandates_leaderboard&hl=en" + ) soup = BeautifulSoup(response.text, "html.parser") agency_overall = soup.find_all("td", class_="gsc_mlt_n gsc_mlt_bd") - for agency, index in zip(agency_policy, [4-1,10-1, 19-1, 64-1]): + for agency, index in zip(agency_policy, [4 - 1, 10 - 1, 19 - 1, 64 - 1]): agency_index = funder.index(agency) self.assertEqual(policy[agency_index], agency_policy[agency]) # Check that the percentage values from CSV and on the page agree. - self.assertEqual(percentageOverall[agency_index], agency_overall[index].text) + self.assertEqual( + percentageOverall[agency_index], agency_overall[index].text + ) # The percentage fluctuates, so we can't check the exact value. 
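             # (delta=2 in the assertion below tolerates up to two percentage points of drift.)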
- self.assertAlmostEqual(int(percentage2020[agency_index][:-1]), int(agency_2020[agency][:-1]), delta=2) - - @unittest.skipIf(sys.platform.startswith("win"), reason="File read is empty in Windows") + self.assertAlmostEqual( + int(percentage2020[agency_index][:-1]), + int(agency_2020[agency][:-1]), + delta=2, + ) + + @unittest.skipIf( + sys.platform.startswith("win"), reason="File read is empty in Windows" + ) @unittest.skipIf(pd is None, reason="pandas is not installed") def test_download_mandates_csv_with_pandas(self): """ @@ -718,20 +873,24 @@ def test_download_mandates_csv_with_pandas(self): if not os.path.exists(self.mandates_filename): text = scholarly.download_mandates_csv(self.mandates_filename) self.assertGreater(len(text), 0) - df = pd.read_csv(self.mandates_filename, usecols=["Funder", "Policy", "2020", "Overall"]).fillna("") + df = pd.read_csv( + self.mandates_filename, usecols=["Funder", "Policy", "2020", "Overall"] + ).fillna("") self.assertGreater(len(df), 0) - funders = ["US National Science Foundation", - "Department of Science & Technology, India", - "Swedish Research Council", - "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning" - ] - - policies = ["https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - "http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf", - "https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html", - "" - ] + funders = [ + "US National Science Foundation", + "Department of Science & Technology, India", + "Swedish Research Council", + "Swedish Research Council for Environment, Agricultural Sciences and Spatial Planning", + ] + + policies = [ + "https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", + "http://www.dst.gov.in/sites/default/files/APPROVED%20OPEN%20ACCESS%20POLICY-DBT%26DST%2812.12.2014%29_1.pdf", + "https://www.vr.se/english/applying-for-funding/requirements-terms-and-conditions/publishing-open-access.html", + "", + ] percentage_overall = [84, 54, 83, 83] percentage_2020 = [87, 49, 89, 88] @@ -741,8 +900,12 @@ def test_download_mandates_csv_with_pandas(self): df_overall = df["Overall"][rows].tolist() df_2020 = df["2020"][rows].tolist() for idx in range(4): - self.assertAlmostEqual(int(df_overall[idx][:-1]), percentage_overall[idx], delta=2) - self.assertAlmostEqual(int(df_2020[idx][:-1]), percentage_2020[idx], delta=2) + self.assertAlmostEqual( + int(df_overall[idx][:-1]), percentage_overall[idx], delta=2 + ) + self.assertAlmostEqual( + int(df_2020[idx][:-1]), percentage_2020[idx], delta=2 + ) def test_save_journal_leaderboard(self): """ @@ -750,23 +913,32 @@ def test_save_journal_leaderboard(self): """ filename = "journals.csv" while os.path.exists(filename): - filename = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz', k=10)) + ".csv" + filename = ( + "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=10)) + ".csv" + ) try: - scholarly.save_journals_csv(category="Physics & Mathematics", subcategory="Astronomy & Astrophysics", - filename=filename, include_comments=True) - with open(filename, "r") as f: + scholarly.save_journals_csv( + category="Physics & Mathematics", + subcategory="Astronomy & Astrophysics", + filename=filename, + include_comments=True, + ) + with open(filename) as f: csv_reader = csv.DictReader(f) for row in csv_reader: - self.assertEqual(row['Publication'], 'The Astrophysical Journal') - self.assertEqual(row['h5-index'], '161') - self.assertEqual(row['h5-median'], '239') 
-            self.assertEqual(row['Comment'], '#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ')
+                    self.assertEqual(row["Publication"], "The Astrophysical Journal")
+                    self.assertEqual(row["h5-index"], "161")
+                    self.assertEqual(row["h5-median"], "239")
+                    self.assertEqual(
+                        row["Comment"],
+                        "#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ",
+                    )
                     break
         finally:
             if os.path.exists(filename):
                 os.remove(filename)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()

From 98fdfd24b63f4edab12bd73087ead5443aa721a3 Mon Sep 17 00:00:00 2001
From: snow-fox
Date: Tue, 18 Oct 2022 12:48:41 +0100
Subject: [PATCH 2/5] update contributing docs

---
 .github/CONTRIBUTING.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 9f80eb12..86213724 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -21,7 +21,8 @@ Additionally, if you are interested in contributing to the codebase, submit a p
 3. Make sure the unit tests pass before raising a PR. For all the unit tests to pass, you typically need to set up a premium proxy service such as `ScraperAPI` or `Luminati` (`Bright Data`). If you do not have an account, you may try to use `FreeProxy`. Without a proxy, 6 out of 17 test cases will be skipped.
 4. Check that the documentation is consistent with the code. Check that the documentation builds successfully.
 5. Submit a PR, with `develop` as your base branch.
-6. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the GitHub repository. Passing all test cases is necessary before merging your PR.
+6. Run `pre-commit run --all-files` and ensure that all tests pass.
+7. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the GitHub repository. Passing all test cases is necessary before merging your PR.
 
 ## Build Docs
 
From 3b32069b049dcec132c16e6c542b0a0cebee6d09 Mon Sep 17 00:00:00 2001
From: snow-fox
Date: Tue, 18 Oct 2022 14:51:48 +0100
Subject: [PATCH 3/5] remove rogue ignore

---
 .pre-commit-config.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0910eef8..e172e648 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,6 @@ repos:
     hooks:
       - id: flake8
         args:
-          - '--per-file-ignores=*/__init__.py:F401 test/all_parameter_combs_test.py:F405 pettingzoo/classic/go/go.py:W605'
           - --extend-ignore=E203
           - --max-complexity=205
           - --max-line-length=300

From 5cfead306ff292c710edaa932451575212f58717 Mon Sep 17 00:00:00 2001
From: Jet
Date: Mon, 31 Oct 2022 14:22:59 +0000
Subject: [PATCH 4/5] revert change

---
 .github/CONTRIBUTING.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 86213724..9f80eb12 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -21,8 +21,7 @@ Additionally, if you are interested in contributing to the codebase, submit a p
 3. Make sure the unit tests pass before raising a PR. For all the unit tests to pass, you typically need to set up a premium proxy service such as `ScraperAPI` or `Luminati` (`Bright Data`). If you do not have an account, you may try to use `FreeProxy`. Without a proxy, 6 out of 17 test cases will be skipped.
 4. Check that the documentation is consistent with the code. Check that the documentation builds successfully.
 5. Submit a PR, with `develop` as your base branch.
-6. Run `pre-commit run --all-files` and ensure that all tests pass.
-7. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the GitHub repository. Passing all test cases is necessary before merging your PR.
+6. After an initial code review by the maintainers, the unit tests will be run with the `ScraperAPI` key stored in the GitHub repository. Passing all test cases is necessary before merging your PR.
 
 ## Build Docs
 
From e3efc107f5be16efebc781d6df0daca2e009cfb5 Mon Sep 17 00:00:00 2001
From: snow-fox
Date: Wed, 2 Nov 2022 12:23:25 +0000
Subject: [PATCH 5/5] fix pre-commits

---
 scholarly/_scholarly.py         | 101 ++++++---
 scholarly/publication_parser.py |  21 +-
 setup.py                        |  10 +-
 test_module.py                  | 365 +++++++++++++++++++-------------
 4 files changed, 307 insertions(+), 190 deletions(-)

diff --git a/scholarly/_scholarly.py b/scholarly/_scholarly.py
index 88f22430..b1fe1721 100644
--- a/scholarly/_scholarly.py
+++ b/scholarly/_scholarly.py
@@ -1,26 +1,35 @@
-"""scholarly.py"""
-import requests
-import re
-import os
+"""Core Scholarly Class."""
+
 import copy
 import csv
+import datetime
 import os
 import pprint
-import datetime
 import re
 from typing import Dict, List, Union
+
+import requests
+from dotenv import find_dotenv, load_dotenv
+
 from ._navigator import Navigator
 from ._proxy_generator import ProxyGenerator
 from .author_parser import AuthorParser
+from .data_types import (
+    Author,
+    AuthorSource,
+    CitesPerYear,
+    Journal,
+    Publication,
+    PublicationSource,
+)
 from .publication_parser import PublicationParser, _SearchScholarIterator
-from .data_types import Author, AuthorSource, CitesPerYear, Journal, Publication, PublicationSource
 
-_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
-_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
-_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
+_AUTHSEARCH = "/citations?hl=en&view_op=search_authors&mauthors={0}"
+_KEYWORDSEARCH = "/citations?hl=en&view_op=search_authors&mauthors=label:{0}"
+_KEYWORDSEARCHBASE = "/citations?hl=en&view_op=search_authors&mauthors={}"
 _KEYWORDSEARCH_PATTERN = "[-: #(),;]+"  # Disallowed characters in the keywords.
-_PUBSEARCH = '/scholar?hl=en&q={0}'
-_CITEDBYSEARCH = '/scholar?hl=en&cites={0}'
+_PUBSEARCH = "/scholar?hl=en&q={0}"
+_CITEDBYSEARCH = "/scholar?hl=en&cites={0}"
 _ORGSEARCH = "/citations?view_op=view_org&hl=en&org={0}"
 _MANDATES_URL = (
     "https://scholar.google.com/citations?view_op=mandates_leaderboard_csv&hl=en"
 )
@@ -291,53 +300,76 @@ def _bin_citations_by_year(cites_per_year: CitesPerYear, year_end):
 
         return years
 
-    def citedby(self, object: Publication)->_SearchScholarIterator:
-        """Searches Google Scholar for other articles that cite this Publication
-        and returns a Publication generator.
+    def citedby(self, object: Publication) -> _SearchScholarIterator:
+        """Searches Google Scholar for other articles that cite this Publication and returns a Publication generator.
 
         :param object: The Publication object whose citing articles are to be fetched
         :type object: Publication
         """
-
-        if object['container_type'] != "Publication":
+        if object["container_type"] != "Publication":
             self.logger.warning("Object not supported for citedby")
             return
 
         if object["num_citations"] <= 1000:
             return PublicationParser(self.__nav).citedby(object)
 
-        self.logger.debug("Since the paper titled %s has %d citations (>1000), "
-                          "fetching it on an annual basis.", object["bib"]["title"], object["num_citations"])
+        self.logger.debug(
+            "Since the paper titled %s has %d citations (>1000), "
+            "fetching it on an annual basis.",
+            object["bib"]["title"],
+            object["num_citations"],
+        )
         year_end = int(datetime.date.today().year)
         if object["source"] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
             self.fill(object)
-            years = self._bin_citations_by_year(object.get("cites_per_year", {}), year_end)
+            years = self._bin_citations_by_year(
+                object.get("cites_per_year", {}), year_end
+            )
         else:
             try:
                 year_low = int(object["bib"]["pub_year"])
             except KeyError:
-                self.logger.warning("Unknown publication year for paper %s, may result in incorrect number "
-                                    "of citedby papers.", object["bib"]["title"])
+                self.logger.warning(
+                    "Unknown publication year for paper %s, may result in incorrect number "
+                    "of citedby papers.",
+                    object["bib"]["title"],
+                )
                 return PublicationParser(self.__nav).citedby(object)
             # Go one year at a time in decreasing order
-            years = zip(range(year_end, year_low-1, -1), range(year_end, year_low-1, -1))
+            years = zip(
+                range(year_end, year_low - 1, -1), range(year_end, year_low - 1, -1)
+            )
 
         # Extract cites_id. Note: There could be multiple ones, separated by commas.
-        m = re.search("cites=[\d+,]*", object["citedby_url"])
+        m = re.search(r"cites=[\d+,]*", object["citedby_url"])
        pub_id = m.group()[6:]
 
        for y_hi, y_lo in years:
-            sub_citations = self.search_citedby(publication_id=pub_id, year_low=y_lo, year_high=y_hi)
+            sub_citations = self.search_citedby(
+                publication_id=pub_id, year_low=y_lo, year_high=y_hi
+            )
            if sub_citations.total_results and (sub_citations.total_results > 1000):
-                self.logger.warn("The paper titled %s has %d citations in the year %d. "
-                                 "Due to the limitation in Google Scholar, fetching only 1000 results "
-                                 "from that year.", object["bib"]["title"], sub_citations.total_results, y_lo)
+                self.logger.warn(
+                    "The paper titled %s has %d citations in the year %d. "
+                    "Due to the limitation in Google Scholar, fetching only 1000 results "
+                    "from that year.",
+                    object["bib"]["title"],
+                    sub_citations.total_results,
+                    y_lo,
+                )
            yield from sub_citations
 
-    def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
-        """Search by author id and return a single Author object
+    def search_author_id(
+        self,
+        id: str,
+        filled: bool = False,
+        sortby: str = "citedby",
+        publication_limit: int = 0,
+    ) -> Author:
+        """Search by author id and return a single Author object.
+
        :param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
        :type sortby: string
        :param publication_limit: if the object is an author, select the max number of publications you want to fill for the author. Defaults to no limit.
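# For reference, the year-binning strategy used by citedby() above can be
# illustrated with a small stand-alone sketch. This is illustrative only, not
# the library's actual `_bin_citations_by_year`: it walks the years newest-first
# and closes a bin whenever adding the next year's count would exceed the
# 1000-result cap that Google Scholar places on a single query.
def bin_citations_by_year(cites_per_year, year_end, cap=1000):
    """Greedily group {year: count} into (y_hi, y_lo) bins of at most `cap` citations."""
    year_low = min(cites_per_year, default=year_end)
    bins, total, y_hi = [], 0, year_end
    for year in range(year_end, year_low - 1, -1):
        count = cites_per_year.get(year, 0)
        # Close the current bin before it overflows. A single year with more
        # than `cap` citations still becomes its own bin, which is why
        # citedby() warns and fetches only 1000 results for such a year.
        if total + count > cap and total > 0:
            bins.append((y_hi, year + 1))
            y_hi, total = year, 0
        total += count
    bins.append((y_hi, year_low))
    return bins

# Example: bin_citations_by_year({2022: 490, 2021: 340, 2020: 327}, 2022)
# returns [(2022, 2021), (2020, 2020)]; each bin sums to at most 1000.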
@@ -395,7 +427,6 @@ def search_keyword(self, keyword: str): 'source': 'SEARCH_AUTHOR_SNIPPETS', 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=lHrs3Y4AAAAJ'} """ - reg_keyword = re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) url = _KEYWORDSEARCH.format(requests.utils.quote(reg_keyword)) return self.__nav.search_authors(url) @@ -431,9 +462,13 @@ def search_keywords(self, keywords: List[str]): 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'} """ - reg_keywords = (re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords) - formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in reg_keywords] - formated_keywords = '+'.join(formated_keywords) + reg_keywords = ( + re.sub(_KEYWORDSEARCH_PATTERN, "_", keyword) for keyword in keywords + ) + formated_keywords = [ + "label:" + requests.utils.quote(keyword) for keyword in reg_keywords + ] + formated_keywords = "+".join(formated_keywords) url = _KEYWORDSEARCHBASE.format(formated_keywords) return self.__nav.search_authors(url) diff --git a/scholarly/publication_parser.py b/scholarly/publication_parser.py index b635fec6..74b6372e 100644 --- a/scholarly/publication_parser.py +++ b/scholarly/publication_parser.py @@ -364,15 +364,18 @@ def fill(self, publication: Publication) -> Publication: [description_part for description_part in val] ) - publication['bib']['abstract'] = result - elif key == 'total citations': - publication['cites_id'] = re.findall( - _SCHOLARPUBRE, val.a['href'])[0].split(',') - publication['citedby_url'] = _CITEDBYLINK.format(','.join(publication['cites_id'])) - elif key == 'scholar articles': - for entry in val.find_all('a'): - if entry.text.lower() == 'related articles': - publication['url_related_articles'] = entry.get('href')[26:] + publication["bib"]["abstract"] = result + elif key == "total citations": + publication["cites_id"] = re.findall(_SCHOLARPUBRE, val.a["href"])[ + 0 + ].split(",") + publication["citedby_url"] = _CITEDBYLINK.format( + ",".join(publication["cites_id"]) + ) + elif key == "scholar articles": + for entry in val.find_all("a"): + if entry.text.lower() == "related articles": + publication["url_related_articles"] = entry.get("href")[26:] break # number of citation per year years = [int(y.text) for y in soup.find_all(class_="gsc_oci_g_t")] diff --git a/setup.py b/setup.py index 531b8780..3afa0ce9 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,11 @@ long_description = fh.read() setuptools.setup( - name='scholarly', - version='1.7.3', - author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi', - author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu', - description='Simple access to Google Scholar authors and citations', + name="scholarly", + version="1.7.3", + author="Steven A. 
Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi", + author_email="steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu", + description="Simple access to Google Scholar authors and citations", long_description=long_description, long_description_content_type="text/markdown", license="Unlicense", diff --git a/test_module.py b/test_module.py index 866dde78..5f18bc63 100644 --- a/test_module.py +++ b/test_module.py @@ -1,18 +1,10 @@ -import unittest -import os -import sys -from collections import Counter -from scholarly import scholarly, ProxyGenerator -from scholarly.data_types import Mandate -from scholarly.publication_parser import PublicationParser -import random -import json import csv import json import os import random import sys import unittest +from collections import Counter from contextlib import contextmanager import requests @@ -143,11 +135,11 @@ def test_search_author_empty_author(self): self.assertIs(len(authors), 0) def test_search_keywords(self): - query = scholarly.search_keywords(['crowdsourcing', 'privacy']) + query = scholarly.search_keywords(["crowdsourcing", "privacy"]) author = next(query) - self.assertEqual(author['scholar_id'], '_cMw1IUAAAAJ') - self.assertEqual(author['name'], 'Arpita Ghosh') - self.assertEqual(author['affiliation'], 'Cornell University') + self.assertEqual(author["scholar_id"], "_cMw1IUAAAAJ") + self.assertEqual(author["name"], "Arpita Ghosh") + self.assertEqual(author["affiliation"], "Cornell University") def test_search_keyword_empty_keyword(self): """ @@ -169,7 +161,7 @@ def test_search_keyword(self): should be listed first. """ # Example 1 - authors = [a['name'] for a in scholarly.search_keyword('3d shape')] + authors = [a["name"] for a in scholarly.search_keyword("3d shape")] self.assertIsNot(len(authors), 0) self.assertIn("Steven A. 
Cholewiak, PhD", authors) @@ -195,24 +187,29 @@ def test_search_keyword(self): self.assertEqual(set(author["interests"]), set(expected_author["interests"])) # Example 3 - expected_author = {'affiliation': "CEA, Département d'Astrophysique", - 'citedby': 98936, - 'email_domain': '@cea.fr', - 'filled': [], - 'interests': ['Cosmology (CMB', - 'weak-lensing', - 'large scale structure)', - 'Statistics', - 'Image Processing'], - 'name': 'Jean-Luc Starck', - 'scholar_id': 'IAaAiXgAAAAJ', - 'source': 'SEARCH_AUTHOR_SNIPPETS', - 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=IAaAiXgAAAAJ' - } - search_query = scholarly.search_keyword('large-scale structure') + expected_author = { + "affiliation": "CEA, Département d'Astrophysique", + "citedby": 98936, + "email_domain": "@cea.fr", + "filled": [], + "interests": [ + "Cosmology (CMB", + "weak-lensing", + "large scale structure)", + "Statistics", + "Image Processing", + ], + "name": "Jean-Luc Starck", + "scholar_id": "IAaAiXgAAAAJ", + "source": "SEARCH_AUTHOR_SNIPPETS", + "url_picture": "https://scholar.google.com/citations?view_op=medium_photo&user=IAaAiXgAAAAJ", + } + search_query = scholarly.search_keyword("large-scale structure") author = next(search_query) for key in author: - if (key not in {"citedby", "container_type", "interests"}) and (key in expected_author): + if (key not in {"citedby", "container_type", "interests"}) and ( + key in expected_author + ): self.assertEqual(author[key], expected_author[key]) scholarly.pprint(author) self.assertEqual(set(author["interests"]), set(expected_author["interests"])) @@ -245,11 +242,15 @@ def test_search_author_single_author(self): self.assertEqual(pub["author_pub_id"], "4bahYMkAAAAJ:LI9QrySNdTsC") self.assertTrue("5738786554683183717" in pub["cites_id"]) scholarly.fill(pub) - mandate = Mandate(agency="US National Science Foundation", effective_date="2016/1", embargo="12 months", - url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", - url_policy_cached="/mandates/nsf-2021-02-13.pdf", - grant="BCS-1354029") - self.assertIn(mandate, pub['mandates']) + mandate = Mandate( + agency="US National Science Foundation", + effective_date="2016/1", + embargo="12 months", + url_policy="https://www.nsf.gov/pubs/2015/nsf15052/nsf15052.pdf", + url_policy_cached="/mandates/nsf-2021-02-13.pdf", + grant="BCS-1354029", + ) + self.assertIn(mandate, pub["mandates"]) # Trigger the pprint method, but suppress the output with self.suppress_stdout(): scholarly.pprint(author) @@ -290,18 +291,21 @@ def test_search_author_id_filled(self): As of July 2020, Marie Skłodowska-Curie has 1963 citations on Google Scholar and 179 publications """ - author = scholarly.search_author_id('EmD_lTEAAAAJ', filled=True) - self.assertEqual(author['name'], u'Marie Skłodowska-Curie') - self.assertEqual(author['affiliation'], - u'Institut du radium, University of Paris') - self.assertEqual(author['interests'], []) - self.assertEqual(author['public_access']['available'], 0) - self.assertEqual(author['public_access']['not_available'], 0) - self.assertGreaterEqual(author['citedby'], 2067) # TODO: maybe change - self.assertGreaterEqual(len(author['publications']), 218) - pub = author['publications'][1] - self.assertEqual(pub["citedby_url"], - "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702") + author = scholarly.search_author_id("EmD_lTEAAAAJ", filled=True) + self.assertEqual(author["name"], "Marie Skłodowska-Curie") + self.assertEqual( + author["affiliation"], "Institut du radium, 
University of Paris" + ) + self.assertEqual(author["interests"], []) + self.assertEqual(author["public_access"]["available"], 0) + self.assertEqual(author["public_access"]["not_available"], 0) + self.assertGreaterEqual(author["citedby"], 2067) # TODO: maybe change + self.assertGreaterEqual(len(author["publications"]), 218) + pub = author["publications"][1] + self.assertEqual( + pub["citedby_url"], + "https://scholar.google.com/scholar?oi=bibs&hl=en&cites=9976400141451962702", + ) def test_extract_author_id_list(self): """ @@ -365,19 +369,26 @@ def test_full_title(self): pub_index = -1 # Skip this part of the test since u_35RYKgDlwC has vanished from Google Scholar if False: - for i in range(len(author['publications'])): - if author['publications'][i]['author_pub_id'] == 'Xxjj6IsAAAAJ:u_35RYKgDlwC': + for i in range(len(author["publications"])): + if ( + author["publications"][i]["author_pub_id"] + == "Xxjj6IsAAAAJ:u_35RYKgDlwC" + ): pub_index = i self.assertGreaterEqual(i, 0) # elided title - self.assertEqual(author['publications'][pub_index]['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …') + self.assertEqual( + author["publications"][pub_index]["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary …", + ) # full text - pub = scholarly.fill(author['publications'][pub_index]) - self.assertEqual(pub['bib']['title'], - u'Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation') + pub = scholarly.fill(author["publications"][pub_index]) + self.assertEqual( + pub["bib"]["title"], + "Evaluation of toxicity of Dichlorvos (Nuvan) to fresh water fish Anabas testudineus and possible modulation by crude aqueous extract of Andrographis paniculata: A preliminary investigation", + ) - self.assertEqual(pub['bib']['citation'], "") + self.assertEqual(pub["bib"]["citation"], "") for i in range(len(author["publications"])): if ( @@ -498,9 +509,13 @@ def test_mandates(self): scholarly.fill(pub) break # The hard-coded reference mandate may need regular updates. - mandate = Mandate(agency="European Commission", effective_date="2013/12", embargo="6 months", grant="647112", - url_policy="https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf", - url_policy_cached="/mandates/horizon2020_eu-2021-02-13-en.pdf", + mandate = Mandate( + agency="European Commission", + effective_date="2013/12", + embargo="6 months", + grant="647112", + url_policy="https://erc.europa.eu/sites/default/files/document/file/ERC%20Open%20Access%20guidelines-Version%201.1._10.04.2017.pdf", + url_policy_cached="/mandates/horizon2020_eu-2021-02-13-en.pdf", ) self.assertIn(mandate, pub["mandates"]) @@ -514,7 +529,9 @@ def test_author_custom_url(self): "Steven A. Cholewiak, PhD", [author["name"] for author in authors] ) - @unittest.skipIf(sys.platform.startswith("win"), reason="File read is empty in Windows") + @unittest.skipIf( + sys.platform.startswith("win"), reason="File read is empty in Windows" + ) def test_download_mandates_csv(self): """ Test that we can download the mandates CSV and read it. 
@@ -551,7 +568,7 @@ def test_download_mandates_csv(self): agency_overall = soup.find_all("td", class_="gsc_mlt_n gsc_mlt_bd") # These hardcoded numbers need some regular updates. - for agency, index in zip(agency_policy, [5-1,9-1, 21-1, 63-1]): + for agency, index in zip(agency_policy, [5 - 1, 9 - 1, 21 - 1, 63 - 1]): agency_index = funder.index(agency) self.assertEqual(policy[agency_index], agency_policy[agency]) # Check that the percentage values from CSV and on the page agree. @@ -631,10 +648,13 @@ def test_save_journal_leaderboard(self): csv_reader = csv.DictReader(f) for row in csv_reader: # These hard-coded values need regular updates. - self.assertEqual(row['Publication'], 'The Astrophysical Journal') - self.assertEqual(row['h5-index'], '167') - self.assertEqual(row['h5-median'], '234') - self.assertEqual(row['Comment'], '#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ') + self.assertEqual(row["Publication"], "The Astrophysical Journal") + self.assertEqual(row["h5-index"], "167") + self.assertEqual(row["h5-median"], "234") + self.assertEqual( + row["Comment"], + "#1 Astronomy & Astrophysics; #2 Physics & Mathematics; ", + ) break finally: if os.path.exists(filename): @@ -642,13 +662,25 @@ def test_save_journal_leaderboard(self): def test_bin_citations_by_year(self): """Test an internal optimization function to bin cites_per_year - while keeping the citation counts less than 1000 per bin. - """ - cpy = {2022: 490, 2021: 340, 2020:327, 2019:298, 2018: 115, 2017: 49, 2016: 20, 2015: 8, 2014: 3, 2013: 1, 2012: 1} + while keeping the citation counts less than 1000 per bin. + """ + cpy = { + 2022: 490, + 2021: 340, + 2020: 327, + 2019: 298, + 2018: 115, + 2017: 49, + 2016: 20, + 2015: 8, + 2014: 3, + 2013: 1, + 2012: 1, + } years = scholarly._bin_citations_by_year(cpy, 2022) for y_hi, y_lo in years: self.assertLessEqual(y_lo, y_hi) - self.assertLessEqual(sum(cpy[y] for y in range(y_lo, y_hi+1)), 1000) + self.assertLessEqual(sum(cpy[y] for y in range(y_lo, y_hi + 1)), 1000) class TestScholarlyWithProxy(unittest.TestCase): @@ -685,23 +717,24 @@ def setUpClass(cls): else: tor_sock_port = None tor_control_port = None - proxy_generator.Tor_External(tor_sock_port, tor_control_port, - tor_password) + proxy_generator.Tor_External(tor_sock_port, tor_control_port, tor_password) elif cls.connection_method == "tor_internal": if sys.platform.startswith("linux") or sys.platform.startswith("darwin"): - tor_cmd = 'tor' + tor_cmd = "tor" elif sys.platform.startswith("win"): - tor_cmd = 'tor.exe' + tor_cmd = "tor.exe" else: tor_cmd = None - proxy_generator.Tor_Internal(tor_cmd = tor_cmd) + proxy_generator.Tor_Internal(tor_cmd=tor_cmd) elif cls.connection_method == "luminati": scholarly.set_retries(10) - proxy_generator.Luminati(usr=os.getenv("USERNAME"), - passwd=os.getenv("PASSWORD"), - proxy_port=os.getenv("PORT")) + proxy_generator.Luminati( + usr=os.getenv("USERNAME"), + passwd=os.getenv("PASSWORD"), + proxy_port=os.getenv("PORT"), + ) elif cls.connection_method == "freeproxy": # Use different instances for primary and secondary @@ -709,7 +742,7 @@ def setUpClass(cls): proxy_generator.FreeProxies() elif cls.connection_method == "scraperapi": - proxy_generator.ScraperAPI(os.getenv('SCRAPER_API_KEY')) + proxy_generator.ScraperAPI(os.getenv("SCRAPER_API_KEY")) else: scholarly.use_proxy(None) @@ -720,7 +753,7 @@ def test_search_pubs_empty_publication(self): """ Test that searching for an empty publication returns zero results """ - pubs = [p for p in scholarly.search_pubs('')] + pubs = [p for p in 
scholarly.search_pubs("")] self.assertIs(len(pubs), 0) def test_search_pubs_citedby(self): @@ -732,12 +765,12 @@ def test_search_pubs_citedby(self): The 'Machine-learned epidemiology' paper had 11 citations as of June 1, 2020. """ - query = 'Machine-learned epidemiology: real-time detection of foodborne illness at scale' + query = "Machine-learned epidemiology: real-time detection of foodborne illness at scale" pubs = [p for p in scholarly.search_pubs(query)] self.assertGreaterEqual(len(pubs), 1) filled = scholarly.fill(pubs[0]) cites = [c for c in scholarly.citedby(filled)] - self.assertEqual(len(cites), filled['num_citations']) + self.assertEqual(len(cites), filled["num_citations"]) def test_search_pubs_citedby_id(self): """ @@ -758,11 +791,11 @@ def test_bibtex(self): Test that we get the BiBTeX entry correctly """ - expected_result = \ - ("""@inproceedings{ester1996density, + expected_result = ( + """@inproceedings{ester1996density, abstract = {Clustering algorithms are attractive for the task of class identification in spatial databases. """ - """However, the application to large spatial databases rises the following requirements for clustering algorithms: """ - """minimal requirements of domain knowledge to determine the input}, + """However, the application to large spatial databases rises the following requirements for clustering algorithms: """ + """minimal requirements of domain knowledge to determine the input}, author = {Ester, Martin and Kriegel, Hans-Peter and Sander, J{\\"o}rg and Xu, Xiaowei and others}, booktitle = {kdd}, number = {34}, @@ -775,8 +808,11 @@ def test_bibtex(self): """ ) - pub = scholarly.search_single_pub("A density-based algorithm for discovering clusters in large " - "spatial databases with noise", filled=True) + pub = scholarly.search_single_pub( + "A density-based algorithm for discovering clusters in large " + "spatial databases with noise", + filled=True, + ) result = scholarly.bibtex(pub) self.assertEqual(result, expected_result.replace("\n ", "\n")) @@ -792,13 +828,16 @@ def test_search_pubs(self): pubs = list(scholarly.search_pubs('"naive physics" stability "3d shape"')) # Check that the first entry in pubs is the same as pub. # Checking for quality holds for non-dict entries only. 
-        for key in {'author_id', 'pub_url', 'num_citations'}:
+        for key in {"author_id", "pub_url", "num_citations"}:
             self.assertEqual(pub[key], pubs[0][key])
-        for key in {'title', 'pub_year', 'venue'}:
-            self.assertEqual(pub['bib'][key], pubs[0]['bib'][key])
+        for key in {"title", "pub_year", "venue"}:
+            self.assertEqual(pub["bib"][key], pubs[0]["bib"][key])
         self.assertGreaterEqual(len(pubs), 27)
-        titles = [p['bib']['title'] for p in pubs]
-        self.assertIn('Visual perception of the physical stability of asymmetric three-dimensional objects', titles)
+        titles = [p["bib"]["title"] for p in pubs]
+        self.assertIn(
+            "Visual perception of the physical stability of asymmetric three-dimensional objects",
+            titles,
+        )
 
     def test_search_pubs_total_results(self):
         """
@@ -811,94 +850,126 @@ def test_search_pubs_total_results(self):
         pubs = scholarly.search_pubs('"naive physics" stability "3d shape"')
         self.assertGreaterEqual(pubs.total_results, 32)
 
-        pubs = scholarly.search_pubs('WIEN2k Blaha')
+        pubs = scholarly.search_pubs("WIEN2k Blaha")
         self.assertGreaterEqual(pubs.total_results, 10000)
 
-        pubs = scholarly.search_pubs('sdfsdf+24r+asdfasdf')
+        pubs = scholarly.search_pubs("sdfsdf+24r+asdfasdf")
         self.assertEqual(pubs.total_results, 0)
 
     def test_search_pubs_filling_publication_contents(self):
-        '''
+        """
         This checks the process of filling a publication
         that is derived from the search publication snippets.
-        '''
-        query = 'Creating correct blur and its effect on accommodation'
+        """
+        query = "Creating correct blur and its effect on accommodation"
         results = scholarly.search_pubs(query)
         pubs = [p for p in results]
         self.assertGreaterEqual(len(pubs), 1)
         f = scholarly.fill(pubs[0])
-        self.assertTrue(f['bib']['author'] == u'Cholewiak, Steven A and Love, Gordon D and Banks, Martin S')
-        self.assertTrue(f['author_id'] == ['4bahYMkAAAAJ', '3xJXtlwAAAAJ', 'Smr99uEAAAAJ'])
-        self.assertTrue(f['bib']['journal'] == u'Journal of Vision')
-        self.assertTrue(f['bib']['number'] == '9')
-        self.assertTrue(f['bib']['pages'] == u'1--1')
-        self.assertTrue(f['bib']['publisher'] == u'The Association for Research in Vision and Ophthalmology')
-        self.assertTrue(f['bib']['title'] == u'Creating correct blur and its effect on accommodation')
-        self.assertTrue(f['pub_url'] == u'https://jov.arvojournals.org/article.aspx?articleid=2701817')
-        self.assertTrue(f['bib']['volume'] == '18')
-        self.assertTrue(f['bib']['pub_year'] == u'2018')
+        self.assertTrue(
+            f["bib"]["author"]
+            == "Cholewiak, Steven A and Love, Gordon D and Banks, Martin S"
+        )
+        self.assertTrue(
+            f["author_id"] == ["4bahYMkAAAAJ", "3xJXtlwAAAAJ", "Smr99uEAAAAJ"]
+        )
+        self.assertTrue(f["bib"]["journal"] == "Journal of Vision")
+        self.assertTrue(f["bib"]["number"] == "9")
+        self.assertTrue(f["bib"]["pages"] == "1--1")
+        self.assertTrue(
+            f["bib"]["publisher"]
+            == "The Association for Research in Vision and Ophthalmology"
+        )
+        self.assertTrue(
+            f["bib"]["title"] == "Creating correct blur and its effect on accommodation"
+        )
+        self.assertTrue(
+            f["pub_url"]
+            == "https://jov.arvojournals.org/article.aspx?articleid=2701817"
+        )
+        self.assertTrue(f["bib"]["volume"] == "18")
+        self.assertTrue(f["bib"]["pub_year"] == "2018")
 
     def test_related_articles_from_author(self):
         """
         Test that we obtain related articles to an article from an author
         """
         author = scholarly.search_author_id("ImhakoAAAAAJ")
-        scholarly.fill(author, sections=['basics', 'publications'])
-        pub = author['publications'][0]
-        self.assertEqual(pub['bib']['title'], 'Prospect theory: An analysis of decision 
under risk') - self.assertEqual(pub['bib']['citation'], 'Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013') + scholarly.fill(author, sections=["basics", "publications"]) + pub = author["publications"][0] + self.assertEqual( + pub["bib"]["title"], "Prospect theory: An analysis of decision under risk" + ) + self.assertEqual( + pub["bib"]["citation"], + "Handbook of the fundamentals of financial decision making: Part I, 99-127, 2013", + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) self.assertEqual(pub["pub_url"], same_article["pub_url"]) - for key in {'title', 'pub_year'}: - self.assertEqual(str(pub['bib'][key]), (same_article['bib'][key])) + for key in {"title", "pub_year"}: + self.assertEqual(str(pub["bib"][key]), (same_article["bib"][key])) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Advances in prospect theory: Cumulative representation of uncertainty') - self.assertEqual(related_article['bib']['pub_year'], '1992') - self.assertGreaterEqual(related_article['num_citations'], 18673) - self.assertIn("A Tversky", related_article['bib']['author']) + self.assertEqual( + related_article["bib"]["title"], + "Advances in prospect theory: Cumulative representation of uncertainty", + ) + self.assertEqual(related_article["bib"]["pub_year"], "1992") + self.assertGreaterEqual(related_article["num_citations"], 18673) + self.assertIn("A Tversky", related_article["bib"]["author"]) def test_related_articles_from_publication(self): """ Test that we obtain related articles to an article from a search """ - pub = scholarly.search_single_pub("Planck 2018 results-VI. Cosmological parameters") + pub = scholarly.search_single_pub( + "Planck 2018 results-VI. 
Cosmological parameters" + ) related_articles = scholarly.get_related_articles(pub) # Typically, the same publication is returned as the most related article same_article = next(related_articles) - for key in {'author_id', 'pub_url', 'num_citations'}: + for key in {"author_id", "pub_url", "num_citations"}: self.assertEqual(pub[key], same_article[key]) - for key in {'title', 'pub_year'}: - self.assertEqual(pub['bib'][key], same_article['bib'][key]) + for key in {"title", "pub_year"}: + self.assertEqual(pub["bib"][key], same_article["bib"][key]) # These may change with time related_article = next(related_articles) - self.assertEqual(related_article['bib']['title'], 'Large Magellanic Cloud Cepheid standards provide ' - 'a 1% foundation for the determination of the Hubble constant and stronger evidence ' - 'for physics beyond ΛCDM') - self.assertEqual(related_article['bib']['pub_year'], '2019') - self.assertGreaterEqual(related_article['num_citations'], 1388) - self.assertIn("AG Riess", related_article['bib']['author']) + self.assertEqual( + related_article["bib"]["title"], + "Large Magellanic Cloud Cepheid standards provide " + "a 1% foundation for the determination of the Hubble constant and stronger evidence " + "for physics beyond ΛCDM", + ) + self.assertEqual(related_article["bib"]["pub_year"], "2019") + self.assertGreaterEqual(related_article["num_citations"], 1388) + self.assertIn("AG Riess", related_article["bib"]["author"]) def test_pubs_custom_url(self): """ Test that we can use custom URLs for retrieving publication data """ - query_url = ('/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' - 'as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31') + query_url = ( + '/scholar?as_q=&as_epq=&as_oq=SFDI+"modulated+imaging"&as_eq=&as_occt=any&as_sauthors=&' + "as_publication=&as_ylo=2005&as_yhi=2020&hl=en&as_sdt=0%2C31" + ) pubs = scholarly.search_pubs_custom_url(query_url) pub = next(pubs) - self.assertEqual(pub['bib']['title'], 'Quantitation and mapping of tissue optical properties using modulated imaging') - self.assertEqual(set(pub['author_id']), {'V-ab9U4AAAAJ', '4k-k6SEAAAAJ', 'GLm-SaQAAAAJ'}) - self.assertEqual(pub['bib']['pub_year'], '2009') - self.assertGreaterEqual(pub['num_citations'], 581) + self.assertEqual( + pub["bib"]["title"], + "Quantitation and mapping of tissue optical properties using modulated imaging", + ) + self.assertEqual( + set(pub["author_id"]), {"V-ab9U4AAAAJ", "4k-k6SEAAAAJ", "GLm-SaQAAAAJ"} + ) + self.assertEqual(pub["bib"]["pub_year"], "2009") + self.assertGreaterEqual(pub["num_citations"], 581) def check_citedby_1k(self, pub): - """A common checking method to check - """ + """A common checking method to check""" original_citation_count = pub["num_citations"] # Trigger a different code path if original_citation_count <= 1000: @@ -908,13 +979,19 @@ def check_citedby_1k(self, pub): self.assertEqual(len(citation_list), original_citation_count) return citation_list - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_citedby_1k_citations(self): - """Test that scholarly can fetch 1000+ citations from an author - """ - author = scholarly.search_author_id('QoX9bu8AAAAJ') - scholarly.fill(author, sections=['publications']) - pub = [_p for _p in author['publications'] if _p["author_pub_id"]=="QoX9bu8AAAAJ:L8Ckcad2t8MC"][0] + """Test 
that scholarly can fetch 1000+ citations from an author""" + author = scholarly.search_author_id("QoX9bu8AAAAJ") + scholarly.fill(author, sections=["publications"]) + pub = [ + _p + for _p in author["publications"] + if _p["author_pub_id"] == "QoX9bu8AAAAJ:L8Ckcad2t8MC" + ][0] scholarly.fill(pub) citation_list = self.check_citedby_1k(pub) @@ -922,10 +999,12 @@ def test_citedby_1k_citations(self): for year, count in pub["cites_per_year"].items(): self.assertEqual(yearwise_counter.get(str(year), 0), count) - @unittest.skipIf(os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, reason="No robust proxy setup") + @unittest.skipIf( + os.getenv("CONNECTION_METHOD") in {None, "none", "freeproxy"}, + reason="No robust proxy setup", + ) def test_citedby_1k_scholar(self): - """Test that scholarly can fetch 1000+ citations from a pub search. - """ + """Test that scholarly can fetch 1000+ citations from a pub search.""" title = "Persistent entanglement in a class of eigenstates of quantum Heisenberg spin glasses" pubs = scholarly.search_pubs(title) pub = next(pubs)