Merge pull request #115 from btrojan-official/feature/54-add-docling-support-btrojan

choinek · web-flow · commit 15d9bb871951 · 2025-04-16T05:47:46.000+02:00
Feature/54: add Docling support (btrojan)
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "google-auth-httplib2",
     "google-auth-oauthlib",
     "transformers",
+    "accelerate",
     "boto3",
     "Pillow",
     "python-magic==0.4.27",
diff --git a/text_extract_api/extract/strategies/docling.py b/text_extract_api/extract/strategies/docling.py
@@ -1,33 +1,39 @@
+import tempfile
+
+from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import (  # Assuming a compatible Docling library or module
+    DoclingDocument,
+)
+
 from text_extract_api.extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
-from docling_core.types.doc.document import DoclingDocument  # Assuming a compatible Docling library or module
-import tempfile
-
 
 class DoclingStrategy(Strategy):
     """
     Extraction strategy for processing PDF documents using Docling.
     """
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
+    def name(self) -> str:
+        return "docling"
+
+    def extract_text(
+        self, file_format: FileFormat, language: str = "en"
+    ) -> ExtractResult:
         """
         Extracts text from a PDF file using Docling and returns an ExtractResult.
 
         :param file_format: Instance of FileFormat (only supports PdfFileFormat).
         :param language: Language of the text (default is 'en').
         :return: ExtractResult containing the extracted DoclingDocument and metadata.
         """
-        if not isinstance(file_format, PdfFileFormat):
-            raise ValueError("DoclingStrategy only supports PdfFileFormat.")
 
         # Save file content to a temporary file
         temp_file_path = self._save_to_temp_file(file_format)
 
         # Convert the document using Docling
         docling_document = self._convert_to_docling(temp_file_path)
 
-        print(docling_document)
         # Return the result wrapped in ExtractResult
         return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
 
@@ -38,7 +44,7 @@ def text_gatherer(self, docling_document: DoclingDocument) -> str:
         :param docling_document: Instance of DoclingDocument.
         :return: Text content in markdown format.
         """
-        return docling_document.to_markdown()
+        return docling_document.export_to_markdown()
 
     def _convert_to_docling(self, file_path: str) -> DoclingDocument:
         """
@@ -49,7 +55,8 @@ def _convert_to_docling(self, file_path: str) -> DoclingDocument:
         """
         # Placeholder for actual conversion logic using the Docling API
         try:
-            docling_document = DoclingDocument.from_file(file_path)
+            converter = DocumentConverter()
+            docling_document = converter.convert(file_path).document
             return docling_document
         except Exception as e:
             raise RuntimeError(f"Failed to convert document using Docling: {e}")
@@ -61,6 +68,6 @@ def _save_to_temp_file(self, file_format: FileFormat) -> str:
         :param file_format: Instance of FileFormat.
         :return: Path to the temporary file containing the file content.
         """
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
-            temp_file.write(file_format.get_content())  # Assuming get_content provides binary content
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+            temp_file.write(file_format.binary)
             return temp_file.name