Skip to content

Commit 7e71df6

Browse files
committed
+ docling file format handler
+ support for docx, txt, html, plain, csv, json, xml + a few fixes
1 parent 15d9bb8 commit 7e71df6

File tree

5 files changed

+56
-5
lines changed

5 files changed

+56
-5
lines changed

examples/example-word-lorem.docx

49 KB
Binary file not shown.

text_extract_api/extract/strategies/docling.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ def extract_text(
2121
self, file_format: FileFormat, language: str = "en"
2222
) -> ExtractResult:
2323
"""
24-
Extracts text from a PDF file using Docling and returns an ExtractResult.
24+
Extracts text from a file using Docling and returns an ExtractResult.
2525
26-
:param file_format: Instance of FileFormat (only supports PdfFileFormat).
26+
:param file_format: Instance of FileFormat (which supports most docling formats).
2727
:param language: Language of the text (default is 'en').
2828
:return: ExtractResult containing the extracted DoclingDocument and metadata.
2929
"""

text_extract_api/extract/strategies/ollama.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import tempfile
33
import time
44

5-
import ollama
5+
import httpx
6+
from ollama import Client
67

78
from extract.extract_result import ExtractResult
89
from text_extract_api.extract.strategies.strategy import Strategy
@@ -26,7 +27,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
2627
raise TypeError(
2728
f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
2829
)
29-
3030
images = FileFormat.convert_to(file_format, ImageFileFormat)
3131
extracted_text = ""
3232
start_time = time.time()
@@ -38,9 +38,10 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
3838
temp_file.write(image.binary)
3939
temp_filename = temp_file.name
4040

41-
print(self._strategy_config)
4241
# Generate text using the specified model
4342
try:
43+
timeout = httpx.Timeout(connect=180.0, read=180.0, write=180.0, pool=180.0)
44+
ollama = Client(timeout=timeout)
4445
response = ollama.chat(self._strategy_config.get('model'), [{
4546
'role': 'user',
4647
'content': self._strategy_config.get('prompt'),
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from typing import Type, Dict, Callable, Iterator
2+
from text_extract_api.files.file_formats.file_format import FileFormat
3+
4+
5+
class DoclingFileFormat(FileFormat):
6+
DEFAULT_FILENAME: str = "document.docling"
7+
DEFAULT_MIME_TYPE: str = "application/vnd.docling"
8+
9+
@staticmethod
10+
def accepted_mime_types() -> list[str]:
11+
return [
12+
"application/pdf", # PDF documents
13+
"application/vnd.docling", # Docling documents
14+
"text/plain",
15+
"text/markdown",
16+
"text/html", # HTML documents
17+
"application/msword",
18+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
19+
"application/vnd.oasis.opendocument.text",
20+
"application/vnd.ms-excel",
21+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
22+
"application/vnd.ms-powerpoint",
23+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
24+
"image/jpeg",
25+
"image/png",
26+
"text/csv",
27+
"application/json",
28+
"application/xml",
29+
]
30+
31+
@staticmethod
32+
def is_pageable() -> bool:
33+
return True
34+
35+
@classmethod
36+
def default_iterator_file_format(cls) -> Type[FileFormat]:
37+
return cls
38+
39+
@staticmethod
40+
def convertible_to() -> Dict[Type["FileFormat"], Callable[[], Iterator["FileFormat"]]]:
41+
# No specific converters needed as the strategy will handle conversion
42+
return {}
43+
44+
@staticmethod
45+
def validate(binary_file_content: bytes):
46+
if not binary_file_content or len(binary_file_content) == 0:
47+
raise ValueError("Empty file content")

text_extract_api/files/file_formats/file_format.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ def from_binary(
6565
filename: Optional[str] = None,
6666
mime_type: Optional[str] = None
6767
) -> Type["FileFormat"]:
68+
if mime_type == "application/octet-stream":
69+
mime_type = None
6870
mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
6971
from text_extract_api.files.file_formats.pdf import PdfFileFormat # type: ignore
7072
file_format_class = cls._get_file_format_class(mime_type)
@@ -196,6 +198,7 @@ def unify(self) -> "FileFormat":
196198
def _get_file_format_class(mime_type: str) -> Type["FileFormat"]:
197199
import text_extract_api.files.file_formats.pdf # noqa - its not unused import @todo autodiscover
198200
import text_extract_api.files.file_formats.image # noqa - its not unused import @todo autodiscover
201+
import text_extract_api.files.file_formats.docling # noqa - its not unused import @todo autodiscover
199202
for subclass in FileFormat.__subclasses__():
200203
if mime_type in subclass.accepted_mime_types():
201204
return subclass

0 commit comments

Comments
 (0)