@@ -103,13 +103,11 @@ def load_single_document(file_path: Path) -> Document:
103103 "strategy" : "fast"
104104 }
105105 })
106-
107106 elif file_extension == ".pdf" :
108107 loader_options .update ({
109108 "extract_images" : False ,
110109 "text_kwargs" : {}, # Optional: passed to https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
111110 })
112-
113111 elif file_extension in [".eml" , ".msg" ]:
114112 loader_options .update ({
115113 "mode" : "single" ,
@@ -124,10 +122,9 @@ def load_single_document(file_path: Path) -> Document:
124122 "bs_kwargs" : {
125123 "features" : "lxml" , # Specify the parser to use (lxml is generally fast and lenient)
126124 # "parse_only": SoupStrainer("body"), # Optionally parse only the body tag
127- # "from_encoding": "iso-8859-1 ", # Specify a different input encoding if needed
125+ # "from_encoding": "utf-8 ", # Specify a different input encoding if needed
128126 },
129127 "get_text_separator" : "\n " , # Use newline as separator when extracting text
130- # Additional parameters and comments:
131128 # "file_path": "path/to/file.html", # Usually set automatically by the loader
132129 # "open_encoding": None, # Set to None to let BeautifulSoup detect encoding
133130 # "get_text_separator": " ", # Use space instead of newline if preferred
@@ -219,7 +216,7 @@ def split_documents(documents=None, text_documents_pdf=None):
219216 text_splitter = RecursiveCharacterTextSplitter (
220217 chunk_size = chunk_size ,
221218 chunk_overlap = chunk_overlap ,
222- length_function = lambda x : len (x ) if isinstance (x , str ) else len (str (x )),
219+ # length_function=lambda x: len(x) if isinstance(x, str) else len(str(x)),
223220 keep_separator = False ,
224221 )
225222
@@ -303,7 +300,6 @@ def split_documents(documents=None, text_documents_pdf=None):
303300 raise
304301
305302
306-
307303"""
308304The PyMuPDF parser was modified in langchain-community 0.3.15+
309305
0 commit comments