From dd675f2dad29e3e486128ad0773c4f55cedd8e67 Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Fri, 4 Apr 2025 14:42:03 -0400
Subject: [PATCH 1/3] loader updates, external docs

---
 docs/external.md                          | 270 ++++++++++++++++++++++
 lux_pipeline/process/_task_ui_manager.py  |   2 +
 lux_pipeline/process/base/loader.py       |  40 +++-
 lux_pipeline/process/download_manager.py  |   2 +-
 lux_pipeline/sources/dnb/loader.py        |   1 -
 5 files changed, 307 insertions(+), 8 deletions(-)
 create mode 100644 docs/external.md

diff --git a/docs/external.md b/docs/external.md
new file mode 100644
index 00000000..4508f2fd
--- /dev/null
+++ b/docs/external.md
@@ -0,0 +1,270 @@
+# External Data Sources
+
+## Implementation Status
+
+| Source          | Fetch | Map | Name Reconcile | Load | IndexLoad | ActivityStream |
+| --------------- | ----- | --- | -------------- | ---- | --------- | -------------- |
+| AAT *^          | ✅    | ✅  | ✅             | N/A  | ✅        | ✅             |
+| DNB             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| FAST            | ✅    | -   | -              | ✅   | N/A       | N/A            |
+| Geonames        | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| LCNAF *^        | ✅    | ✅  | ✅             | ✅   | ✅        | ✅             |
+| LCSH *^         | ✅    | ✅  | ✅             | ✅   | ✅        | ✅             |
+| TGN             | ✅    | ✅  | -              | N/A  | N/A       | ✅             |
+| ULAN *^         | ✅    | ✅  | ✅             | N/A  | ✅        | ✅             |
+| VIAF ^          | ✅    | ✅  | -              | ✅   | ✅        | N/A            |
+| Who's on First  | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| Wikidata ^      | ✅    | ✅  | -              | ✅   | ✅        | N/A            |
+| Japan NDL       | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| BNF             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| GBIF            | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| ORCID           | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| ROR             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| Wikimedia       | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| BNE             | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| SNAC            | ✅    | ✅  | -              | N/A  | N/A       | N/A            |
+| Homosaurus      | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+| Nomisma         | ✅    | ✅  | -              | ✅   | N/A       | N/A            |
+
+✅ = Done; - = Not started; N/A = Can't/Won't be done
+
+### Notes:
+- `*` indicates the name is indexed
+- `^` indicates the URI is indexed
+
+---
+
+## External Source Details
+
+- **Getty Vocabularies**: Authoritative, structured thesauri and union list used for cataloging, research, and interoperability in art, architecture, and cultural heritage domains.
+  - ActivityStreams are updated monthly.
+  - LUX harvests the following datasets, all available via the [main vocab AS](https://data.getty.edu/vocab/activity-stream/):
+    - AAT (Art & Architecture Thesaurus)
+      - Linked.art Class: Concept
+    - TGN (Thesaurus of Geographic Names)
+      - Linked.art Class: Place
+    - ULAN (Union List of Artist Names)
+      - Linked.art Class: Person, Group
+  - Format: JSON-LD
+  - Individual records can be fetched at (e.g.):
+    `https://vocab.getty.edu/aat/{identifier}.jsonld`
+
+- **DNB (German National Library)**: A comprehensive repository of bibliographic and authority data for German-speaking regions.
+  - Dump files are updated monthly.
+  - LUX harvests the following datasets:
+    - [Sachbegriff](https://data.dnb.de/opendata/authorities-gnd-sachbegriff_lds.jsonld.gz)
+      - Linked.art Class: Concept, Group
+    - [Entity Facts](https://data.dnb.de/opendata/authorities-gnd_entityfacts.jsonld.gz)
+      - Linked.art Class: Person, Group, Place, Event
+    - [Mapped Authorities](https://data.dnb.de/opendata/mapping-authorities-gnd-lcsh-ram_lds.jsonld.gz)
+      - Linked.art Property: equivalent
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://hub.culturegraph.org/entityfacts/{identifier}`
+
+- **Geonames**: Geographical database that provides data on over 25 million places worldwide, including names, coordinates, and other metadata.
+  - Dump files are updated daily.
+  - LUX harvests the following datasets:
+    - [All Countries](https://download.geonames.org/export/dump/allCountries.zip)
+      - Linked.art Class: Place
+    - [Alternate Names V2](https://download.geonames.org/export/dump/alternateNamesV2.zip)
+      - Linked.art Property: identifiedBy
+    - [Hierarchy](https://download.geonames.org/export/dump/hierarchy.zip)
+      - Linked.art Property: partOf
+  - Format: CSV/RDF
+  - Individual records can be fetched at:
+    `https://sws.geonames.org/{identifier}/about.rdf`
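+
+  - For example, a minimal sketch of resolving one of these `{identifier}`
+    templates (the helper function and the identifier shown are illustrative,
+    not pipeline code):
+
+    ```python
+    import requests
+
+    def fetch_record(template: str, identifier: str) -> bytes:
+        """Fill in a {identifier} URL template and fetch the raw record."""
+        url = template.format(identifier=identifier)
+        resp = requests.get(url, timeout=30)
+        resp.raise_for_status()
+        return resp.content
+
+    # e.g. the Geonames RDF description of one place record
+    rdf = fetch_record("https://sws.geonames.org/{identifier}/about.rdf", "2950159")
+    ```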
+
+- **FAST (Faceted Application of Subject Terminology)**: Simplified subject vocabulary derived from the Library of Congress Subject Headings (LCSH).
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [FAST ALL](https://researchworks.oclc.org/researchdata/fast/FASTAll.marcxml.zip)
+      - Linked.art Class: *not yet mapped*
+  - Format: MARC-XML
+  - Individual records can be fetched at:
+    `https://id.worldcat.org/fast/{identifier}.rdf.xml`
+
+- **Wikidata**: Collaborative, multilingual, and structured knowledge base that stores linked data to support Wikimedia projects and beyond.
+  - The dump file is updated weekly, typically on Mondays.
+  - LUX harvests the following dataset:
+    - [Wikidata Latest All](https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.gz)
+      - Linked.art Class: HumanMadeObject, LinguisticObject, Person, Group, Concept, Place, Event
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://www.wikidata.org/wiki/Special:EntityData/{identifier}.json`
+
+- **Wikimedia Commons**: Free, collaborative media repository that hosts millions of openly licensed images, videos, audio files, and other media.
+  - No dump file is available. LUX fetches images as they are referenced, and only images with one of the following licenses:
+    - pd, cc0, cc-by-sa-4.0, cc-by-4.0
+  - Linked.art Class: DigitalObject, related to the Class of the entity it represents via the `representation` property.
+  - Format: LUX only fetches images in .jpg, .jpeg, .gif, and .png formats; more are available via Wikimedia Commons.
+  - Individual images can be fetched at:
+    `https://en.wikipedia.org/w/api.php?action=query&prop=imageinfo&iiprop=extmetadata&titles=File:{identifier}&format=json`
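+
+  - A sketch of that license check (the response shape and the `License`
+    extmetadata key are assumptions about the MediaWiki API, not pipeline
+    code):
+
+    ```python
+    import requests
+
+    ALLOWED = {"pd", "cc0", "cc-by-sa-4.0", "cc-by-4.0"}
+
+    def license_ok(filename: str) -> bool:
+        """Return True if the Commons file carries one of the allowed licenses."""
+        params = {
+            "action": "query",
+            "prop": "imageinfo",
+            "iiprop": "extmetadata",
+            "titles": f"File:{filename}",
+            "format": "json",
+        }
+        resp = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
+        pages = resp.json().get("query", {}).get("pages", {})
+        for page in pages.values():
+            for info in page.get("imageinfo", []):
+                lic = info.get("extmetadata", {}).get("License", {}).get("value", "")
+                if lic in ALLOWED:
+                    return True
+        return False
+    ```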
+
+- **Library of Congress**: Structured, machine-readable representations of authoritative bibliographic, subject, and name data.
+  - Dump files do not have a specified update frequency.
+  - LUX harvests the following datasets:
+    - [NAF (Name Authority File)](https://id.loc.gov/download/authorities/names.madsrdf.jsonld.gz)
+      - Linked.art Class: Person, Group, Place, Activity, Period
+    - [SH (Subject Headings)](https://id.loc.gov/download/authorities/subjects.madsrdf.jsonld.gz)
+      - Linked.art Class: Concept
+    - [DGT (Demographic Group Terms)](https://id.loc.gov/download/authorities/demographicTerms.madsrdf.jsonld.gz)
+      - Linked.art Class: Concept
+  - As of 2025, LUX prefers the Activity Streams for NAF and SH:
+    - `https://id.loc.gov/authorities/names/activitystreams/feed/1`
+    - `https://id.loc.gov/authorities/subjects/activitystreams/feed/1`
+  - Format: JSON-LD/MADS/RDF
+  - Individual records can be fetched at (e.g.):
+    `http://id.loc.gov/authorities/names/{identifier}.json`
+
+- **ORCID**: Unique, persistent identifier system for researchers and scholars.
+  - The dump file is updated yearly in October.
+  - LUX harvests the following dataset:
+    - [2024 Summaries](https://orcid.figshare.com/ndownloader/files/49560102)
+      - Linked.art Class: Person
+  - Format: XML
+  - Individual records can be fetched via ORCID's API, but LUX relies solely on the dump file.
+
+- **ROR (Research Organization Registry)**: Global, open, and community-driven registry of unique identifiers for research organizations.
+  - The dump file is updated monthly.
+  - LUX harvests the following dataset:
+    - [ROR Data](https://zenodo.org/records/14429114/files/v1.58-2024-12-11-ror-data.zip)
+      - Linked.art Class: Group
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://api.ror.org/organizations/{identifier}`
+
+- **VIAF (Virtual International Authority File)**: International service that consolidates and links authority data for names of people, organizations, and more, from libraries and cultural institutions worldwide.
+  - Dump files are typically updated monthly; however, as of August 2024, updates are paused while VIAF undergoes security and production environment improvements.
+  - LUX harvests the following dataset:
+    - [VIAF Clusters](https://viaf.org/viaf/data/viaf-20240804-clusters.xml.gz)
+      - Linked.art Class: Person, Group, Place
+  - Format: XML
+  - Individual records can be fetched at:
+    `https://viaf.org/viaf/{identifier}/viaf.xml`
+
+- **Who’s on First (WOF)**: Open-source gazetteer and database of geographic places, providing unique identifiers and metadata for locations worldwide.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [WOF Global Latest](https://data.geocode.earth/wof/dist/sqlite/whosonfirst-data-admin-latest.db.bz2)
+      - Linked.art Class: Place
+  - Format: SQLite database
+  - Individual records can be fetched at:
+    `https://data.whosonfirst.org/{identifier}`
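+
+  - A sketch of reading one place out of the SQLite dump (the `geojson`
+    table and column names are assumptions about the dump's schema, and the
+    `.db.bz2` file must be decompressed first):
+
+    ```python
+    import json
+    import sqlite3
+
+    def read_wof_record(db_path: str, wof_id: int):
+        """Return the GeoJSON body for one WOF id, or None if absent."""
+        conn = sqlite3.connect(db_path)
+        try:
+            row = conn.execute(
+                "SELECT body FROM geojson WHERE id = ?", (wof_id,)
+            ).fetchone()
+            return json.loads(row[0]) if row else None
+        finally:
+            conn.close()
+    ```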
+
+- **SNAC (Social Networks and Archival Context)**: Cooperative initiative to discover biographical and historical information about people, families, and organizations, connecting them through archival records.
+  - No dump file is available. LUX fetches records as they are referenced.
+  - Linked.art Class: Person, Group
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://snaccooperative.org/download?arkid=http://n2t.net/ark:/99166/{identifier}&type=constellation_json`
+
+- **GBIF (Global Biodiversity Information Facility)**: International network and data platform that provides open access to biodiversity data, enabling research on species distribution and ecosystems worldwide.
+  - No dump file of the entire dataset is available. LUX fetches records as they are referenced, usually from Yale Peabody Museum taxonomic records.
+  - Linked.art Class: Concept
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://api.gbif.org/v1/species/{identifier}`
+
+- **Homosaurus**: International LGBTQ+ linked data vocabulary that provides standardized terms to improve the discovery and organization of LGBTQ+ resources in libraries, archives, and other information systems.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following dataset:
+    - [V3](https://homosaurus.org/v3.jsonld)
+      - Linked.art Class: Concept
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `https://homosaurus.org/v3/{identifier}.jsonld`
+
+- **Nomisma**: Collaborative project that provides a linked open data vocabulary and digital resource for numismatics, focusing on the study of coins, currency, and related objects.
+  - Dump files are updated nightly.
+  - LUX harvests the following dataset:
+    - [Nomisma](https://nomisma.org/nomisma.org.jsonld)
+      - Linked.art Class: Person, Group, Place, Concept
+  - Format: JSON-LD
+  - Individual records can be fetched at:
+    `http://nomisma.org/id/{identifier}.jsonld`
+
+- **BNE (Biblioteca Nacional de España)**: National Library of Spain, which provides access to Spain's cultural and historical heritage through its collection of books, manuscripts, maps, and digital resources.
+  - Dump files do not have a specified update frequency, but the webpage includes the upload date for each dataset.
+  - LUX harvests the following datasets:
+    - [Entidad](https://www.bne.es/media/datosgob/catalogo-autoridades/entidad/entidad-JSON.zip)
+      - Linked.art Class: Group
+    - [Materia](https://www.bne.es/media/datosgob/catalogo-autoridades/materia/materia-JSON.zip)
+      - Linked.art Class: Concept
+    - [Geografico](https://www.bne.es/media/datosgob/catalogo-autoridades/geografico/geografico-JSON.zip)
+      - Linked.art Class: Place
+    - [Persona](https://www.bne.es/media/datosgob/catalogo-autoridades/persona/persona-JSON.zip)
+      - Linked.art Class: Person
+  - Format: JSON
+  - Individual records can be fetched at:
+    `https://datos.bne.es/resource/{identifier}.jsonld`
+
+- **BNF (Bibliothèque nationale de France)**: National library of France, preserving and providing access to a vast collection of books, manuscripts, and cultural heritage materials.
+  - In the past, LUX relied on the BNF's RDF/JSON-LD service for harvesting; however, that service has not been consistently available, so LUX switched to the XML dump files.
+  - Dump files do not have a specified update frequency.
+  - LUX harvests the following datasets:
+    - [DataBNF Rameau NoSubjects](https://transfert.bnf.fr/link/c26ba50e-17c4-46fe-b6d8-8c2ad393f40e)
+      - Linked.art Class: Concept
+    - [DataBNF Person Authors](https://transfert.bnf.fr/link/c412f451-2bf2-45a7-b76b-a11d563c2a8a)
+      - Linked.art Class: Person
+    - [DataBNF Org Authors](https://transfert.bnf.fr/link/2a2b3690-f642-4644-8615-9b50b59c84d9)
+      - Linked.art Class: Group
+    - [DataBNF Geos](https://transfert.bnf.fr/link/86ea06b4-2590-4d1c-8e1e-126eff24b535)
+      - Linked.art Class: Place
+  - Format: XML (see the note about RDF/JSON-LD above, and the identifier sketch at the end of this document)
+  - If the service is available, individual records can be fetched at:
+    `https://data.bnf.fr/ark:/12148/{identifier}.rdfjsonld`
+
+- **Japan NDL (Japanese National Diet Library)**: Provides access to a wide range of bibliographic and authority data, enabling researchers and institutions to retrieve and utilize information from the NDL's extensive collections.
+  - While dump files are available for subject headings, LUX retrieves records as they are referenced.
+  - Format: JSON-LD
+  - Individual records can be fetched at (e.g.):
+    `https://id.ndl.go.jp/auth/ndlsh/{identifier}.json`
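+
+---
+
+Because the BNF dump files are XML whose file names do not carry usable
+record identifiers, the ark identifier has to be pulled out of the record
+body itself. A rough sketch, mirroring the approach of the pipeline's BNF
+loader:
+
+```python
+import re
+
+def bnf_identifier(xml_text):
+    """Extract the ark identifier from one raw BNF XML record."""
+    m = re.search(r'https?://data\.bnf\.fr/ark:/12148/([^"#<>\s]+)', xml_text)
+    return m.group(1) if m else None
+```
+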
diff --git a/lux_pipeline/process/_task_ui_manager.py b/lux_pipeline/process/_task_ui_manager.py
index 6e7aa116..2fe8fd56 100644
--- a/lux_pipeline/process/_task_ui_manager.py
+++ b/lux_pipeline/process/_task_ui_manager.py
@@ -7,6 +7,8 @@
 import logging
 logger = logging.getLogger("lux_pipeline")
 import traceback
+import os
+
 
 class TaskLogHandler(logging.Handler):
     def __init__(self, manager):
diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index a4b02aef..309e78bb 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -1,16 +1,15 @@
-
+import re
 import io
 import os
 import requests
 import shutil
 import time
 import gzip
-import bz2
 import zipfile
 import tarfile
 import ujson as json
 import logging
-logger = logging.getLogger("lux_pipeline")
+
 try:
     import magic
 except:
@@ -233,7 +232,7 @@ def iterate_tar(self, path, comp, remaining):
         mode = "r"
         with tarfile.open(path, mode) as th:
             if self.increment_total and len(remaining) == 1:
-                names = th.namelist()
+                names = th.getnames()
                 self.update_progress_bar(increment_total=len(names))
                 del names
             ti = th.next()
@@ -373,6 +372,7 @@ def make_json(self, path, comp, parent):
         return {'identifier': ident, 'data': data}
 
     def make_other(self, path, comp, parent):
+        print(f"path is {path}")
        ident = self.make_identifier(path)
         with self.file_opener(path, comp) as fh:
             data = fh.read()
@@ -393,6 +393,11 @@ def make_identifier(self, value):
             value = value.name
         elif isinstance(value, bytes):
             value = value.decode('utf-8')
+
+        # file names do not yield usable identifiers for xml files
+        if isinstance(value, str) and value.endswith(".xml"):
+            return None
+
         try:
             last = value.split('/')[-1]
             return last.split('.')[0]
@@ -411,7 +416,30 @@ def post_process_json(self, data):
 
     def post_process_other(self, data):
         # This is called after discovering the record and before extracting the identifier
-        return data
+        if isinstance(data, bytes):
+            try:
+                data = data.decode("utf-8")
+            except UnicodeDecodeError:
+                data = data.decode("utf-8", errors="replace")
+
+        match = re.search(
+            r'''
+            rdf:about="https?://data\.bnf\.fr/ark:/12148/(?P<bnf>cb\d{9})"(?!"|#) |
+            (?P<fast>[^<]+)
+            ''',
+            data,
+            re.VERBOSE,
+        )
+
+        if match:
+            ident = match.group("bnf") or match.group("fast")
+            print(f"ident is {ident}")
+
+        result = {"raw": data}
+        if ident:
+            result["id"] = ident
+
+        return result
 
     def should_make_record(self, path):
         if self.max_slice > 1 and self.seen % self.max_slice != self.my_slice:
@@ -463,7 +491,7 @@ def store_record(self, record):
         try:
             self.out_cache[identifier] = data
         except Exception as e:
-            logger.error(e)
+            print(e)
             return False
         self.increment_progress_bar(1)
         return True
diff --git a/lux_pipeline/process/download_manager.py b/lux_pipeline/process/download_manager.py
index 6a8ac950..1e0b7fa8 100644
--- a/lux_pipeline/process/download_manager.py
+++ b/lux_pipeline/process/download_manager.py
@@ -1,4 +1,4 @@
-
+import logging
 from ._task_ui_manager import TaskUiManager
 from lux_pipeline.cli._rich import get_bar_from_layout
 
diff --git a/lux_pipeline/sources/dnb/loader.py b/lux_pipeline/sources/dnb/loader.py
index cd3ca1d7..988058be 100644
--- a/lux_pipeline/sources/dnb/loader.py
+++ b/lux_pipeline/sources/dnb/loader.py
@@ -26,7 +26,6 @@ def iterate_sachbegriff(self, path, comp, parent):
 
 
 class OldDnbLoader:
-
     def __init__(self, config):
         Loader.__init__(self, config)
         d = config['all_configs'].data_dir
From 4ee5d820d9c6de7654baef7825597bb357d435e0 Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Fri, 4 Apr 2025 14:45:31 -0400
Subject: [PATCH 2/3] removed print statements, added back logging

---
 lux_pipeline/process/base/loader.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index 309e78bb..38c447b2 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -5,11 +5,14 @@
 import shutil
 import time
 import gzip
+import bz2
 import zipfile
 import tarfile
 import ujson as json
 import logging
+logger = logging.getLogger("lux_pipeline")
+
 
 try:
     import magic
 except:
@@ -372,7 +375,6 @@ def make_json(self, path, comp, parent):
         return {'identifier': ident, 'data': data}
 
     def make_other(self, path, comp, parent):
-        print(f"path is {path}")
         ident = self.make_identifier(path)
         with self.file_opener(path, comp) as fh:
             data = fh.read()
@@ -433,7 +435,6 @@ def post_process_other(self, data):
 
         if match:
             ident = match.group("bnf") or match.group("fast")
-            print(f"ident is {ident}")
 
         result = {"raw": data}
         if ident:
@@ -491,7 +492,7 @@ def store_record(self, record):
         try:
             self.out_cache[identifier] = data
         except Exception as e:
-            print(e)
+            logger.error(e)
             return False
         self.increment_progress_bar(1)
         return True

From eb65d85422d1a045245034b7d68b5094354525ea Mon Sep 17 00:00:00 2001
From: Kelly Davis
Date: Mon, 14 Apr 2025 10:54:40 -0400
Subject: [PATCH 3/3] removed xml handling from base loader, created bnf loader

---
 lux_pipeline/process/base/loader.py | 24 +-------------
 lux_pipeline/sources/bnf/loader.py  | 53 +++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 23 deletions(-)
 create mode 100644 lux_pipeline/sources/bnf/loader.py

diff --git a/lux_pipeline/process/base/loader.py b/lux_pipeline/process/base/loader.py
index d9dc656f..5c601302 100644
--- a/lux_pipeline/process/base/loader.py
+++ b/lux_pipeline/process/base/loader.py
@@ -418,29 +418,7 @@ def post_process_json(self, data):
 
     def post_process_other(self, data):
         # This is called after discovering the record and before extracting the identifier
-        if isinstance(data, bytes):
-            try:
-                data = data.decode("utf-8")
-            except UnicodeDecodeError:
-                data = data.decode("utf-8", errors="replace")
-
-        match = re.search(
-            r'''
-            rdf:about="https?://data\.bnf\.fr/ark:/12148/(?P<bnf>cb\d{9})"(?!"|#) |
-            (?P<fast>[^<]+)
-            ''',
-            data,
-            re.VERBOSE,
-        )
-
-        if match:
-            ident = match.group("bnf") or match.group("fast")
-
-        result = {"raw": data}
-        if ident:
-            result["id"] = ident
-
-        return result
+        return data
 
     def should_make_record(self, path):
         if self.max_slice > 1 and self.seen % self.max_slice != self.my_slice:
diff --git a/lux_pipeline/sources/bnf/loader.py b/lux_pipeline/sources/bnf/loader.py
new file mode 100644
index 00000000..d1b2494f
--- /dev/null
+++ b/lux_pipeline/sources/bnf/loader.py
@@ -0,0 +1,53 @@
+import re
+import logging
+from lux_pipeline.process.base.loader import Loader
+
+logger = logging.getLogger("lux_pipeline")
+
+
+class BnfLoader(Loader):
+    def __init__(self, config):
+        super().__init__(config)
+        self._temp_records = {}
+
+    def extract_identifier(self, data):
+        if isinstance(data, bytes):
+            try:
+                data = data.decode("utf-8")
+            except UnicodeDecodeError:
+                data = data.decode("utf-8", errors="replace")
+
+        match = re.search(r'https?://data\.bnf\.fr/ark:/12148/([^"#<>\s]+)', data)
+        if match:
+            return match.group(1)
+        else:
+            logger.warning(f"BNF loader can't find an identifier for {data[:200]}...")
+            return None
+
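+    # A single BNF entity can span multiple XML documents in the dump, so
+    # store_record() below buffers record data per identifier and load()
+    # merges the buffered chunks before anything is written to out_cache.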
+    def post_process_other(self, data):
+        return {'raw': data}  # Wrap raw XML as-is
+
+    def store_record(self, record):
+        # Collect all records by identifier, defer storing
+        ident = record["identifier"]
+        if ident not in self._temp_records:
+            self._temp_records[ident] = []
+        self._temp_records[ident].append(record["data"])
+        return True  # Don't store in out_cache yet
+
+    def load(self, disable_ui=False, overwrite=True):
+        # Run base loading (parsing + buffering only)
+        super().load(disable_ui=disable_ui, overwrite=overwrite)
+
+        # Merge and store final data
+        for ident, records in self._temp_records.items():
+            combined = "\n".join(r["raw"] for r in records)
+            record = {"identifier": ident, "data": {"raw": combined}}
+
+            if self.should_store_record(record):
+                try:
+                    self.out_cache[ident] = record["data"]
+                    self.post_store_record(record)
+                    self.increment_progress_bar(1)
+                except Exception as e:
+                    logger.error(f"Failed to store merged BNF record {ident}: {e}")
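+
+# Rough usage sketch (illustrative only; real invocations go through the
+# pipeline's configuration machinery rather than direct construction):
+#
+#   loader = BnfLoader(config)
+#   loader.load(disable_ui=True)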