+ docling file format handler

choinek · choinek · commit 7e71df6b0af7 · 2025-04-26T09:33:03.000+02:00
+ support for docx, txt, html, plain, csv, json, xml
+ a few fixes
diff --git a/examples/example-word-lorem.docx b/examples/example-word-lorem.docx
diff --git a/text_extract_api/extract/strategies/docling.py b/text_extract_api/extract/strategies/docling.py
@@ -21,9 +21,9 @@ def extract_text(
         self, file_format: FileFormat, language: str = "en"
     ) -> ExtractResult:
         """
-        Extracts text from a PDF file using Docling and returns an ExtractResult.
+        Extracts text from a file using Docling and returns an ExtractResult.
 
-        :param file_format: Instance of FileFormat (only supports PdfFileFormat).
+        :param file_format: Instance of FileFormat (which supports most docling formats).
         :param language: Language of the text (default is 'en').
         :return: ExtractResult containing the extracted DoclingDocument and metadata.
         """
diff --git a/text_extract_api/extract/strategies/ollama.py b/text_extract_api/extract/strategies/ollama.py
@@ -2,7 +2,8 @@
 import tempfile
 import time
 
-import ollama
+import httpx
+from ollama import Client
 
 from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
@@ -26,7 +27,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
             raise TypeError(
                 f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
             )
-
         images = FileFormat.convert_to(file_format, ImageFileFormat)
         extracted_text = ""
         start_time = time.time()
@@ -38,9 +38,10 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 temp_file.write(image.binary)
                 temp_filename = temp_file.name
 
-            print(self._strategy_config)
             # Generate text using the specified model
             try:
+                timeout = httpx.Timeout(connect=180.0, read=180.0, write=180.0, pool=180.0)
+                ollama = Client(timeout=timeout)
                 response = ollama.chat(self._strategy_config.get('model'), [{
                     'role': 'user',
                     'content': self._strategy_config.get('prompt'),
diff --git a/text_extract_api/files/file_formats/docling.py b/text_extract_api/files/file_formats/docling.py
@@ -0,0 +1,47 @@
+from typing import Type, Dict, Callable, Iterator
+from text_extract_api.files.file_formats.file_format import FileFormat
+
+
+class DoclingFileFormat(FileFormat):
+    DEFAULT_FILENAME: str = "document.docling"
+    DEFAULT_MIME_TYPE: str = "application/vnd.docling"
+
+    @staticmethod
+    def accepted_mime_types() -> list[str]:
+        return [
+            "application/pdf",  # PDF documents
+            "application/vnd.docling",  # Docling documents
+            "text/plain",
+            "text/markdown",
+            "text/html",  # HTML documents
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.oasis.opendocument.text",
+            "application/vnd.ms-excel",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-powerpoint",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            "image/jpeg",
+            "image/png",
+            "text/csv",
+            "application/json",
+            "application/xml",
+        ]
+
+    @staticmethod
+    def is_pageable() -> bool:
+        return True
+
+    @classmethod
+    def default_iterator_file_format(cls) -> Type[FileFormat]:
+        return cls
+
+    @staticmethod
+    def convertible_to() -> Dict[Type["FileFormat"], Callable[[], Iterator["FileFormat"]]]:
+        # No specific converters needed as the strategy will handle conversion
+        return {}
+
+    @staticmethod
+    def validate(binary_file_content: bytes):
+        if not binary_file_content or len(binary_file_content) == 0:
+            raise ValueError("Empty file content")
diff --git a/text_extract_api/files/file_formats/file_format.py b/text_extract_api/files/file_formats/file_format.py
@@ -65,6 +65,8 @@ def from_binary(
             filename: Optional[str] = None,
             mime_type: Optional[str] = None
     ) -> Type["FileFormat"]:
+        if mime_type == "application/octet-stream":
+            mime_type = None
         mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
         from text_extract_api.files.file_formats.pdf import PdfFileFormat  # type: ignore
         file_format_class = cls._get_file_format_class(mime_type)
@@ -196,6 +198,7 @@ def unify(self) -> "FileFormat":
     def _get_file_format_class(mime_type: str) -> Type["FileFormat"]:
         import text_extract_api.files.file_formats.pdf  # noqa - its not unused import @todo autodiscover
         import text_extract_api.files.file_formats.image  # noqa - its not unused import @todo autodiscover
+        import text_extract_api.files.file_formats.docling  # noqa - its not unused import @todo autodiscover
         for subclass in FileFormat.__subclasses__():
             if mime_type in subclass.accepted_mime_types():
                 return subclass