@@ -225,14 +225,24 @@ def split_documents(documents=None, text_documents_pdf=None):
225225 text_splitter = RecursiveCharacterTextSplitter (
226226 chunk_size = chunk_size ,
227227 chunk_overlap = chunk_overlap ,
228- keep_separator = False ,
228+ keep_separator = False ,
229229 )
230230
231231 texts = []
232232
233233 # Split non-PDF documents (no cleanup needed - handled in load_single_document)
234234 if documents :
235+ # Ensure all input documents have string content before splitting
236+ for doc in documents :
237+ if not isinstance (doc .page_content , str ):
238+ doc .page_content = str (doc .page_content or "" )
239+
235240 texts = text_splitter .split_documents (documents )
241+
242+ # Ensure all split documents have string content
243+ for text_doc in texts :
244+ if not isinstance (text_doc .page_content , str ):
245+ text_doc .page_content = str (text_doc .page_content or "" )
236246
237247 """
238248 I customized LangChain's PyMuPDFParser to add custom page markers as follows:
@@ -250,6 +260,11 @@ def split_documents(documents=None, text_documents_pdf=None):
250260 # 2. Split PDF documents (with custom page markers) #
251261 # ------------------------------------------------------------------ #
252262 if text_documents_pdf :
263+ # Ensure all PDF documents have string content before processing
264+ for pdf_doc in text_documents_pdf :
265+ if not isinstance (pdf_doc .page_content , str ):
266+ pdf_doc .page_content = str (pdf_doc .page_content or "" )
267+
253268 processed_pdf_docs = []
254269 for doc in text_documents_pdf :
255270 chunked_docs = add_pymupdf_page_metadata (
@@ -258,16 +273,26 @@ def split_documents(documents=None, text_documents_pdf=None):
258273 chunk_overlap = chunk_overlap ,
259274 )
260275 processed_pdf_docs .extend (chunked_docs )
276+
277+ # Ensure all PDF chunks have string content
278+ for pdf_doc in processed_pdf_docs :
279+ if not isinstance (pdf_doc .page_content , str ):
280+ pdf_doc .page_content = str (pdf_doc .page_content or "" )
281+
261282 texts .extend (processed_pdf_docs )
262283
284+ # Final safety check: ensure ALL texts have string content
285+ for text_doc in texts :
286+ if not isinstance (text_doc .page_content , str ):
287+ text_doc .page_content = str (text_doc .page_content or "" )
288+
263289 return texts
264290
265291 except Exception as e :
266292 logging .exception ("Error during document splitting" )
267293 logging .error (f"Error type: { type (e )} " )
268294 raise
269295
270-
271296"""
272297The PyMuPDF parser was modified in langchain-community 0.3.15+
273298
0 commit comments