From dd675f2dad29e3e486128ad0773c4f55cedd8e67 Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Fri, 4 Apr 2025 14:42:03 -0400
Subject: [PATCH 1/3] loader updates, external docs

---
 docs/external.md                          | 270 ++++++++++++++++++++++
 lux_pipeline/process/_task_ui_manager.py  |   2 +
 lux_pipeline/process/base/loader.py       |  40 +++-
 lux_pipeline/process/download_manager.py  |   2 +-
 lux_pipeline/sources/dnb/loader.py        |   1 -
 5 files changed, 307 insertions(+), 8 deletions(-)
 create mode 100644 docs/external.md

diff --git a/docs/external.md b/docs/external.md
new file mode 100644
index 00000000..4508f2fd
--- /dev/null
+++ b/docs/external.md
@@ -0,0 +1,270 @@
+# External Data Sources
+
+## Implementation Status
+
+| Source          | Fetch | Map | Name Reconcile | Load | IndexLoad | ActivityStream |
+| --------------- | ----- | --- | -------------- | ---- | --------- | -------------- |
+| AAT *^          | ✅    | ✅  | ✅             | N/A  | ✅        | ✅             |
+| DNB             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| FAST            | ✅    | -   | -              | ✅   | N/A       | N/A            |
+| Geonames        | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| LCNAF *^        | ✅    | ✅  | ✅             | ✅   | ✅        | ✅             |
+| LCSH *^         | ✅    | ✅  | ✅             | ✅   | ✅        | ✅             |
+| TGN             | ✅    | ✅  | -              | N/A  | N/A       | ✅             |
+| ULAN *^         | ✅    | ✅  | ✅             | N/A  | ✅        | ✅             |
+| VIAF ^          | ✅    | ✅  | -              | ✅   | ✅        | N/A            |
+| Who's on First  | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| Wikidata ^      | ✅    | ✅  | -              | ✅   | ✅        | N/A            |
+| Japan NDL       | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| BNF             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| GBIF            | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| ORCID           | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| ROR             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| Wikimedia       | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| BNE             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| SNAC            | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| Homosaurus      | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| Nomisma         | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+
+✅ = Done; - = Not started; N/A = Can't/Won't be done
+
+### Notes:
+- `*` indicates the name is indexed
+- `^` indicates the URI is indexed
+
+---
+
+## External Source Details
+
+- **Getty Vocabularies**: Authoritative, structured thesauri and union list used for cataloging, research, and interoperability in art, architecture, and cultural heritage domains.
+  - ActivityStreams are updated monthly.
+  - LUX harvests the following datasets, all available via the [main vocab AS](https://data.getty.edu/vocab/activity-stream/):
+    - AAT (Art & Architecture Thesaurus)
+      - Linked.art Class: Concept
+    - TGN (Thesaurus of Geographic Names)
+      - Linked.art Class: Place
+    - ULAN (Union List of Artist Names)
+      - Linked.art Class: Person, Group
+  - Format: JSON-LD
+  - Individual records can be fetched at (e.g.):
+    `https://vocab.getty.edu/aat/{identifier}.jsonld`
+
+- **DNB (German National Library)**: A comprehensive repository of bibliographic and authority data for German-speaking regions.
+  - Dump files are updated monthly.
+  - LUX harvests the following datasets:
+    - [Sachbegriff](https://data.dnb.de/opendata/authorities-gnd-sachbegriff_lds.jsonld.gz)
+      - Linked.art Class: Concept, Group
+    - [Entity Facts](https://data.dnb.de/opendata/authorities-gnd_entityfacts.jsonld.gz)
+      - Linked.art Class: Person, Group, Place, Event
+    - [Mapped Authorities](https://data.dnb.de/opendata/mapping-authorities-gnd-lcsh-ram_lds.jsonld.gz)
+      - Linked.art Property: equivalent
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://hub.culturegraph.org/entityfacts/{identifier}`
+
+- **Geonames**: Geographical database that provides data on over 25 million places worldwide, including names, coordinates, and other metadata.
+  - Dump files are updated daily.
+  - LUX harvests the following datasets:
+    - [All Countries](https://download.geonames.org/export/dump/allCountries.zip)
+      - Linked.art Class: Place
+    - [Alternate Names V2](https://download.geonames.org/export/dump/alternateNamesV2.zip)
+      - Linked.art Property: identifiedBy
+    - [Hierarchy](https://download.geonames.org/export/dump/hierarchy.zip)
+      - Linked.art Property: partOf
+  - Format: CSV/RDF
+  - Individual records can be fetched at:
+    `https://sws.geonames.org/{identifier}/about.rdf`
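+
+  - For example, a minimal sketch of resolving one of these `{identifier}`
+    templates (the helper function and the identifier shown are illustrative,
+    not pipeline code):
+
+    ```python
+    import requests
+
+    def fetch_record(template: str, identifier: str) -> bytes:
+        """Fill in a {identifier} URL template and fetch the raw record."""
+        url = template.format(identifier=identifier)
+        resp = requests.get(url, timeout=30)
+        resp.raise_for_status()
+        return resp.content
+
+    # e.g. the Geonames RDF description of one place record
+    rdf = fetch_record("https://sws.geonames.org/{identifier}/about.rdf", "2950159")
+    ```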
+
+- **FAST (Faceted Application of Subject Terminology)**: Simplified subject vocabulary derived from the Library of Congress Subject Headings (LCSH).
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [FAST ALL](https://researchworks.oclc.org/researchdata/fast/FASTAll.marcxml.zip)
+      - Linked.art Class: *not yet mapped*
+  - Format: MARC-XML
+  - Individual records can be fetched at:
+    `https://id.worldcat.org/fast/{identifier}.rdf.xml`
+
+- **Wikidata**: Collaborative, multilingual, and structured knowledge base that stores linked data to support Wikimedia projects and beyond.
+  - The dump file is updated weekly, typically on Mondays.
+  - LUX harvests the following dataset:
+    - [Wikidata Latest All](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz)
+      - Linked.art Class: HumanMadeObject, LinguisticObject, Person, Group, Concept, Place, Event
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://www.wikidata.org/wiki/Special:EntityData/{identifier}.json`
+
+- **Wikimedia Commons**: Free, collaborative media repository that hosts millions of openly licensed images, videos, audio files, and other media.
+  - No dump file is available. LUX fetches images as they are referenced, and only images with one of the following licenses:
+    - pd, cc0, cc-by-sa-4.0, cc-by-4.0
+  - Linked.art Class: DigitalObject, related to the Class of the entity it represents via the `representation` property.
+  - Format: LUX only fetches images in .jpg, .jpeg, .gif, and .png formats; more are available via Wikimedia Commons.
+  - Individual images can be fetched at:
+    `https://en.wikipedia.org/w/api.php?action=query&prop=imageinfo&iiprop=extmetadata&titles=File:{identifier}&format=json`
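+
+  - A sketch of that license check (the response shape and the `License`
+    extmetadata key are assumptions about the MediaWiki API, not pipeline
+    code):
+
+    ```python
+    import requests
+
+    ALLOWED = {"pd", "cc0", "cc-by-sa-4.0", "cc-by-4.0"}
+
+    def license_ok(filename: str) -> bool:
+        """Return True if the Commons file carries one of the allowed licenses."""
+        params = {
+            "action": "query",
+            "prop": "imageinfo",
+            "iiprop": "extmetadata",
+            "titles": f"File:{filename}",
+            "format": "json",
+        }
+        resp = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
+        pages = resp.json().get("query", {}).get("pages", {})
+        for page in pages.values():
+            for info in page.get("imageinfo", []):
+                lic = info.get("extmetadata", {}).get("License", {}).get("value", "")
+                if lic in ALLOWED:
+                    return True
+        return False
+    ```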
+
+- **Library of Congress**: Structured, machine-readable representations of authoritative bibliographic, subject, and name data.
+  - Dump files do not have a specified update frequency.
+  - LUX harvests the following datasets:
+    - [NAF (Name Authority File)](https://id.loc.gov/download/authorities/names.madsrdf.jsonld.gz)
+      - Linked.art Class: Person, Group, Place, Activity, Period
+    - [SH (Subject Headings)](https://id.loc.gov/download/authorities/subjects.madsrdf.jsonld.gz)
+      - Linked.art Class: Concept
+    - [DGT (Demographic Group Terms)](https://id.loc.gov/download/authorities/demographicTerms.madsrdf.jsonld.gz)
+      - Linked.art Class: Concept
+  - As of 2025, LUX prefers the Activity Streams for NAF and SH:
+    - `https://id.loc.gov/authorities/names/activitystreams/feed/1`
+    - `https://id.loc.gov/authorities/subjects/activitystreams/feed/1`
+  - Format: JSON-LD/MADS/RDF
+  - Individual records can be fetched at (e.g.):
+    `http://id.loc.gov/authorities/names/{identifier}.json`
+
+- **ORCID**: Unique, persistent identifier system for researchers and scholars.
+  - The dump file is updated yearly in October.
+  - LUX harvests the following dataset:
+    - [2024 Summaries](https://orcid.figshare.com/ndownloader/files/49560102)
+      - Linked.art Class: Person
+  - Format: XML
+  - Individual records can be fetched via ORCID's API, but LUX relies solely on the dump file.
+
+- **ROR (Research Organization Registry)**: Global, open, and community-driven registry of unique identifiers for research organizations.
+  - The dump file is updated monthly.
+  - LUX harvests the following dataset:
+    - [ROR Data](https://zenodo.org/records/14429114/files/v1.58-2024-12-11-ror-data.zip)
+      - Linked.art Class: Group
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://api.ror.org/organizations/{identifier}`
+
+- **VIAF (Virtual International Authority File)**: International service that consolidates and links authority data for names of people, organizations, and more, from libraries and cultural institutions worldwide.
+  - Dump files are typically updated monthly; however, as of August 2024, updates are paused while VIAF undergoes security and production environment improvements.
+  - LUX harvests the following dataset:
+    - [VIAF Clusters](https://viaf.org/viaf/data/viaf-20240804-clusters.xml.gz)
+      - Linked.art Class: Person, Group, Place
+  - Format: XML
+  - Individual records can be fetched at:
+    `https://viaf.org/viaf/{identifier}/viaf.xml`
+
+- **Who’s on First (WOF)**: Open-source gazetteer and database of geographic places, providing unique identifiers and metadata for locations worldwide.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [WOF Global Latest](https://data.geocode.earth/wof/dist/sqlite/whosonfirst-data-admin-latest.db.bz2)
+      - Linked.art Class: Place
+  - Format: SQLite database
+  - Individual records can be fetched at:
+    `https://data.whosonfirst.org/{identifier}`
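+
+  - A sketch of reading one place out of the SQLite dump (the `geojson`
+    table and column names are assumptions about the dump's schema, and the
+    `.db.bz2` file must be decompressed first):
+
+    ```python
+    import json
+    import sqlite3
+
+    def read_wof_record(db_path: str, wof_id: int):
+        """Return the GeoJSON body for one WOF id, or None if absent."""
+        conn = sqlite3.connect(db_path)
+        try:
+            row = conn.execute(
+                "SELECT body FROM geojson WHERE id = ?", (wof_id,)
+            ).fetchone()
+            return json.loads(row[0]) if row else None
+        finally:
+            conn.close()
+    ```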
+
+- **SNAC (Social Networks and Archival Context)**: Cooperative initiative to discover biographical and historical information about people, families, and organizations, connecting them through archival records.
+  - No dump file is available. LUX fetches records as they are referenced.
+  - Linked.art Class: Person, Group
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://snaccooperative.org/download?arkid=http://n2t.net/ark:/99166/{identifier}&type=constellation_json`
+
+- **GBIF (Global Biodiversity Information Facility)**: International network and data platform that provides open access to biodiversity data, enabling research on species distribution and ecosystems worldwide.
+  - No dump file of the entire dataset is available. LUX fetches records as they are referenced, usually from Yale Peabody Museum taxonomic records.
+  - Linked.art Class: Concept
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://api.gbif.org/v1/species/{identifier}`
+
+- **Homosaurus**: International LGBTQ+ linked data vocabulary that provides standardized terms to improve the discovery and organization of LGBTQ+ resources in libraries, archives, and other information systems.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [V3](https://homosaurus.org/v3.jsonld)
+      - Linked.art Class: Concept
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://homosaurus.org/v3/{identifier}.jsonld`
+
+- **Nomisma**: Collaborative project that provides a linked open data vocabulary and digital resource for numismatics, focusing on the study of coins, currency, and related objects.
+  - Dump files are updated nightly.
+  - LUX harvests the following dataset:
+    - [Nomisma](https://nomisma.org/nomisma.org.jsonld)
+      - Linked.art Class: Person, Group, Place, Concept
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `http://nomisma.org/id/{identifier}.jsonld`
+
+- **BNE (Biblioteca Nacional de España)**: National Library of Spain, which provides access to Spain's cultural and historical heritage through its collection of books, manuscripts, maps, and digital resources.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following datasets:
+    - [Entidad](https://www.bne.es/media/datosgob/catalogo-autoridades/entidad/entidad-JSON.zip)
+      - Linked.art Class: Group
+    - [Materia](https://www.bne.es/media/datosgob/catalogo-autoridades/materia/materia-JSON.zip)
+      - Linked.art Class: Concept
+    - [Geografico](https://www.bne.es/media/datosgob/catalogo-autoridades/geografico/geografico-JSON.zip)
+      - Linked.art Class: Place
+    - [Persona](https://www.bne.es/media/datosgob/catalogo-autoridades/persona/persona-JSON.zip)
+      - Linked.art Class: Person
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://datos.bne.es/resource/{identifier}.jsonld`
+
+- **BNF (Bibliothèque nationale de France)**: National library of France, preserving and providing access to a vast collection of books, manuscripts, and cultural heritage materials.
+  - In the past, LUX relied on the BNF's RDF/JSON-LD service for harvesting; however, that service has not been consistently available, so LUX switched to the XML dump files.
+  - Dump files do not have a specified update frequency.
+  - LUX harvests the following datasets:
+    - [DataBNF Rameau NoSubjects](https://transfert.bnf.fr/link/c26ba50e-17c4-46fe-b6d8-8c2ad393f40e)
+      - Linked.art Class: Concept
+    - [DataBNF Person Authors](https://transfert.bnf.fr/link/c412f451-2bf2-45a7-b76b-a11d563c2a8a)
+      - Linked.art Class: Person
+    - [DataBNF Org Authors](https://transfert.bnf.fr/link/2a2b3690-f642-4644-8615-9b50b59c84d9)
+      - Linked.art Class: Group
+    - [DataBNF Geos](https://transfert.bnf.fr/link/86ea06b4-2590-4d1c-8e1e-126eff24b535)
+      - Linked.art Class: Place
+  - Format: XML (see the note about RDF/JSON-LD above, and the identifier sketch at the end of this document)
+  - If the service is available, individual records can be fetched at:
+    `https://data.bnf.fr/ark:/12148/{identifier}.rdfjsonld`
+
+- **Japan NDL (Japanese National Diet Library)**: Provides access to a wide range of bibliographic and authority data, enabling researchers and institutions to retrieve and utilize information from the NDL's extensive collections.
+  - While dump files are available for subject headings, LUX retrieves records as they are referenced.
+  - Format: JSON-LD
+  - Individual records can be fetched at (e.g.):
+    `https://id.ndl.go.jp/auth/ndlsh/{identifier}.json`
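+
+---
+
+Because the BNF dump files are XML whose file names do not carry usable
+record identifiers, the ark identifier has to be pulled out of the record
+body itself. A rough sketch, mirroring the approach of the pipeline's BNF
+loader:
+
+```python
+import re
+
+def bnf_identifier(xml_text):
+    """Extract the ark identifier from one raw BNF XML record."""
+    m = re.search(r'https?://data\.bnf\.fr/ark:/12148/([^"#<>\s]+)', xml_text)
+    return m.group(1) if m else None
+```
+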
diff --git a/lux_pipeline/process/_task_ui_manager.py b/lux_pipeline/process/_task_ui_manager.py
index 6e7aa116..2fe8fd56 100644
--- a/lux_pipeline/process/_task_ui_manager.py
+++ b/lux_pipeline/process/_task_ui_manager.py
@@ -7,6 +7,8 @@
 import logging
 logger = logging.getLogger("lux_pipeline")
 import traceback
+import os
+
 
 class TaskLogHandler(logging.Handler):
     def __init__(self, manager):
diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index a4b02aef..309e78bb 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -1,16 +1,15 @@
-
+import re
 import io
 import os
 import requests
 import shutil
 import time
 import gzip
-import bz2
 import zipfile
 import tarfile
 import ujson as json
 import logging
-logger = logging.getLogger("lux_pipeline")
+
 try:
     import magic
 except:
@@ -233,7 +232,7 @@ def iterate_tar(self, path, comp, remaining):
         mode = "r"
         with tarfile.open(path, mode) as th:
             if self.increment_total and len(remaining) == 1:
-                names = th.namelist()
+                names = th.getnames()
                 self.update_progress_bar(increment_total=len(names))
                 del names
             ti = th.next()
@@ -373,6 +372,7 @@ def make_json(self, path, comp, parent):
         return {'identifier': ident, 'data': data}
 
     def make_other(self, path, comp, parent):
+        print(f"path is {path}")
        ident = self.make_identifier(path)
         with self.file_opener(path, comp) as fh:
             data = fh.read()
@@ -393,6 +393,11 @@ def make_identifier(self, value):
             value = value.name
         elif isinstance(value, bytes):
             value = value.decode('utf-8')
+
+        # file names do not yield usable identifiers for xml files
+        if isinstance(value, str) and value.endswith(".xml"):
+            return None
+
         try:
             last = value.split('/')[-1]
             return last.split('.')[0]
@@ -411,7 +416,30 @@ def post_process_json(self, data):
 
     def post_process_other(self, data):
         # This is called after discovering the record and before extracting the identifier
-        return data
+        if isinstance(data, bytes):
+            try:
+                data = data.decode("utf-8")
+            except UnicodeDecodeError:
+                data = data.decode("utf-8", errors="replace")
+
+        match = re.search(
+            r'''
+            rdf:about="https?://data\.bnf\.fr/ark:/12148/(?P<bnf>cb\d{9})"(?!"|#) |
+            (?P<fast>[^<]+)
+            ''',
+            data,
+            re.VERBOSE,
+        )
+
+        if match:
+            ident = match.group("bnf") or match.group("fast")
+            print(f"ident is {ident}")
+
+        result = {"raw": data}
+        if ident:
+            result["id"] = ident
+
+        return result
 
     def should_make_record(self, path):
         if self.max_slice > 1 and self.seen % self.max_slice != self.my_slice:
@@ -463,7 +491,7 @@ def store_record(self, record):
         try:
             self.out_cache[identifier] = data
         except Exception as e:
-            logger.error(e)
+            print(e)
             return False
         self.increment_progress_bar(1)
         return True
diff --git a/lux_pipeline/process/download_manager.py b/lux_pipeline/process/download_manager.py
index 6a8ac950..1e0b7fa8 100644
--- a/lux_pipeline/process/download_manager.py
+++ b/lux_pipeline/process/download_manager.py
@@ -1,4 +1,4 @@
-
+import logging
 from ._task_ui_manager import TaskUiManager
 from lux_pipeline.cli._rich import get_bar_from_layout
 
diff --git a/lux_pipeline/sources/dnb/loader.py b/lux_pipeline/sources/dnb/loader.py
index cd3ca1d7..988058be 100644
--- a/lux_pipeline/sources/dnb/loader.py
+++ b/lux_pipeline/sources/dnb/loader.py
@@ -26,7 +26,6 @@ def iterate_sachbegriff(self, path, comp, parent):
 
 
 class OldDnbLoader:
-
     def __init__(self, config):
         Loader.__init__(self, config)
         d = config['all_configs'].data_dir
From 4ee5d820d9c6de7654baef7825597bb357d435e0 Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Fri, 4 Apr 2025 14:45:31 -0400
Subject: [PATCH 2/3] removed print statements, added back logging

---
 lux_pipeline/process/base/loader.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index 309e78bb..38c447b2 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -5,11 +5,14 @@
 import shutil
 import time
 import gzip
+import bz2
 import zipfile
 import tarfile
 import ujson as json
 import logging
+logger = logging.getLogger("lux_pipeline")
+
 
 try:
     import magic
 except:
@@ -372,7 +375,6 @@ def make_json(self, path, comp, parent):
         return {'identifier': ident, 'data': data}
 
     def make_other(self, path, comp, parent):
-        print(f"path is {path}")
         ident = self.make_identifier(path)
         with self.file_opener(path, comp) as fh:
             data = fh.read()
@@ -433,7 +435,6 @@ def post_process_other(self, data):
 
         if match:
             ident = match.group("bnf") or match.group("fast")
-            print(f"ident is {ident}")
 
         result = {"raw": data}
         if ident:
@@ -491,7 +492,7 @@ def store_record(self, record):
         try:
             self.out_cache[identifier] = data
         except Exception as e:
-            print(e)
+            logger.error(e)
             return False
         self.increment_progress_bar(1)
         return True

From eb65d85422d1a045245034b7d68b5094354525ea Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Mon, 14 Apr 2025 10:54:40 -0400
Subject: [PATCH 3/3] removed xml handling from base loader, created bnf loader

---
 lux_pipeline/process/base/loader.py | 24 +-------------
 lux_pipeline/sources/bnf/loader.py  | 53 +++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 23 deletions(-)
 create mode 100644 lux_pipeline/sources/bnf/loader.py

diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index d9dc656f..5c601302 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -418,29 +418,7 @@ def post_process_json(self, data):
 
     def post_process_other(self, data):
         # This is called after discovering the record and before extracting the identifier
-        if isinstance(data, bytes):
-            try:
-                data = data.decode("utf-8")
-            except UnicodeDecodeError:
-                data = data.decode("utf-8", errors="replace")
-
-        match = re.search(
-            r'''
-            rdf:about="https?://data\.bnf\.fr/ark:/12148/(?P<bnf>cb\d{9})"(?!"|#) |
-            (?P<fast>[^<]+)
-            ''',
-            data,
-            re.VERBOSE,
-        )
-
-        if match:
-            ident = match.group("bnf") or match.group("fast")
-
-        result = {"raw": data}
-        if ident:
-            result["id"] = ident
-
-        return result
+        return data
 
     def should_make_record(self, path):
         if self.max_slice > 1 and self.seen % self.max_slice != self.my_slice:
diff --git a/lux_pipeline/sources/bnf/loader.py b/lux_pipeline/sources/bnf/loader.py
new file mode 100644
index 00000000..d1b2494f
--- /dev/null
+++ b/lux_pipeline/sources/bnf/loader.py
@@ -0,0 +1,53 @@
+import re
+import logging
+from lux_pipeline.process.base.loader import Loader
+
+logger = logging.getLogger("lux_pipeline")
+
+
+class BnfLoader(Loader):
+    def __init__(self, config):
+        super().__init__(config)
+        self._temp_records = {}
+
+    def extract_identifier(self, data):
+        if isinstance(data, bytes):
+            try:
+                data = data.decode("utf-8")
+            except UnicodeDecodeError:
+                data = data.decode("utf-8", errors="replace")
+
+        match = re.search(r'https?://data\.bnf\.fr/ark:/12148/([^"#<>\s]+)', data)
+        if match:
+            return match.group(1)
+        else:
+            logger.warning(f"BNF loader can't find an identifier for {data[:200]}...")
+            return None
+
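+    # A single BNF entity can span multiple XML documents in the dump, so
+    # store_record() below buffers record data per identifier and load()
+    # merges the buffered chunks before anything is written to out_cache.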
+    def post_process_other(self, data):
+        return {'raw': data}  # Wrap raw XML as-is
+
+    def store_record(self, record):
+        # Collect all records by identifier, defer storing
+        ident = record["identifier"]
+        if ident not in self._temp_records:
+            self._temp_records[ident] = []
+        self._temp_records[ident].append(record["data"])
+        return True  # Don't store in out_cache yet
+
+    def load(self, disable_ui=False, overwrite=True):
+        # Run base loading (parsing + buffering only)
+        super().load(disable_ui=disable_ui, overwrite=overwrite)
+
+        # Merge and store final data
+        for ident, records in self._temp_records.items():
+            combined = "\n".join(r["raw"] for r in records)
+            record = {"identifier": ident, "data": {"raw": combined}}
+
+            if self.should_store_record(record):
+                try:
+                    self.out_cache[ident] = record["data"]
+                    self.post_store_record(record)
+                    self.increment_progress_bar(1)
+                except Exception as e:
+                    logger.error(f"Failed to store merged BNF record {ident}: {e}")
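+
+# Rough usage sketch (illustrative only; real invocations go through the
+# pipeline's configuration machinery rather than direct construction):
+#
+#   loader = BnfLoader(config)
+#   loader.load(disable_ui=True)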