Skip to content

Commit 2d68e98

Browse files
authored
Add files via upload
1 parent 6a22614 commit 2d68e98

File tree

1 file changed

+2
-6
lines changed

1 file changed

+2
-6
lines changed

src/document_processor.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -103,13 +103,11 @@ def load_single_document(file_path: Path) -> Document:
103103
"strategy": "fast"
104104
}
105105
})
106-
107106
elif file_extension == ".pdf":
108107
loader_options.update({
109108
"extract_images": False,
110109
"text_kwargs": {}, # Optional: passed to https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
111110
})
112-
113111
elif file_extension in [".eml", ".msg"]:
114112
loader_options.update({
115113
"mode": "single",
@@ -124,10 +122,9 @@ def load_single_document(file_path: Path) -> Document:
124122
"bs_kwargs": {
125123
"features": "lxml", # Specify the parser to use (lxml is generally fast and lenient)
126124
# "parse_only": SoupStrainer("body"), # Optionally parse only the body tag
127-
# "from_encoding": "iso-8859-1", # Specify a different input encoding if needed
125+
# "from_encoding": "utf-8", # Specify a different input encoding if needed
128126
},
129127
"get_text_separator": "\n", # Use newline as separator when extracting text
130-
# Additional parameters and comments:
131128
# "file_path": "path/to/file.html", # Usually set automatically by the loader
132129
# "open_encoding": None, # Set to None to let BeautifulSoup detect encoding
133130
# "get_text_separator": " ", # Use space instead of newline if preferred
@@ -219,7 +216,7 @@ def split_documents(documents=None, text_documents_pdf=None):
219216
text_splitter = RecursiveCharacterTextSplitter(
220217
chunk_size=chunk_size,
221218
chunk_overlap=chunk_overlap,
222-
length_function=lambda x: len(x) if isinstance(x, str) else len(str(x)),
219+
# length_function=lambda x: len(x) if isinstance(x, str) else len(str(x)),
223220
keep_separator = False,
224221
)
225222

@@ -303,7 +300,6 @@ def split_documents(documents=None, text_documents_pdf=None):
303300
raise
304301

305302

306-
307303
"""
308304
The PyMuPDF parser was modified in langchain-community 0.3.15+
309305

0 commit comments

Comments
 (0)