Skip to content

Commit ba3e2a8

Browse files
authored
Add files via upload
1 parent fb603df commit ba3e2a8

File tree

1 file changed

+27
-2
lines changed

1 file changed

+27
-2
lines changed

src/document_processor.py

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -225,14 +225,24 @@ def split_documents(documents=None, text_documents_pdf=None):
225225
text_splitter = RecursiveCharacterTextSplitter(
226226
chunk_size=chunk_size,
227227
chunk_overlap=chunk_overlap,
228-
keep_separator = False,
228+
keep_separator=False,
229229
)
230230

231231
texts = []
232232

233233
# Split non-PDF documents (no cleanup needed - handled in load_single_document)
234234
if documents:
235+
# Ensure all input documents have string content before splitting
236+
for doc in documents:
237+
if not isinstance(doc.page_content, str):
238+
doc.page_content = str(doc.page_content or "")
239+
235240
texts = text_splitter.split_documents(documents)
241+
242+
# Ensure all split documents have string content
243+
for text_doc in texts:
244+
if not isinstance(text_doc.page_content, str):
245+
text_doc.page_content = str(text_doc.page_content or "")
236246

237247
"""
238248
I customized langchain's pymupdfparser to add custom page markers as follows:
@@ -250,6 +260,11 @@ def split_documents(documents=None, text_documents_pdf=None):
250260
# 2. Split PDF documents (with custom page markers) #
251261
# ------------------------------------------------------------------ #
252262
if text_documents_pdf:
263+
# Ensure all PDF documents have string content before processing
264+
for pdf_doc in text_documents_pdf:
265+
if not isinstance(pdf_doc.page_content, str):
266+
pdf_doc.page_content = str(pdf_doc.page_content or "")
267+
253268
processed_pdf_docs = []
254269
for doc in text_documents_pdf:
255270
chunked_docs = add_pymupdf_page_metadata(
@@ -258,16 +273,26 @@ def split_documents(documents=None, text_documents_pdf=None):
258273
chunk_overlap=chunk_overlap,
259274
)
260275
processed_pdf_docs.extend(chunked_docs)
276+
277+
# Ensure all PDF chunks have string content
278+
for pdf_doc in processed_pdf_docs:
279+
if not isinstance(pdf_doc.page_content, str):
280+
pdf_doc.page_content = str(pdf_doc.page_content or "")
281+
261282
texts.extend(processed_pdf_docs)
262283

284+
# Final safety check: ensure ALL texts have string content
285+
for text_doc in texts:
286+
if not isinstance(text_doc.page_content, str):
287+
text_doc.page_content = str(text_doc.page_content or "")
288+
263289
return texts
264290

265291
except Exception as e:
266292
logging.exception("Error during document splitting")
267293
logging.error(f"Error type: {type(e)}")
268294
raise
269295

270-
271296
"""
272297
The PyMUPDF parser was modified in langchain-community 0.3.15+
273298

0 commit comments

Comments
 (0)