[feat]: enhance DoclingStrategy with PDF extraction and temporary file handling

choinek · choinek · commit 54a97cd4f22c · 2025-03-27T02:54:49.000+01:00
diff --git a/text_extract_api/extract/strategies/docling.py b/text_extract_api/extract/strategies/docling.py
@@ -1,54 +1,65 @@
-import tempfile
-
-from typing import Optional
-from docling_core.types.doc.document import DoclingDocument
-
-from docling_parse.docling_parse import pdf_parser_v2
-
+from text_extract_api.extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
-from text_extract_api.files.file_formats.file_format import FileFormat
+from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
+from docling import DoclingDocument  # Assuming a compatible Docling library or module
+import tempfile
 
 class DoclingStrategy(Strategy):
-
-    def __init__(self):
-        super().__init__()
-        self._document: Optional[DoclingDocument] = None
-        self._current_file_format: Optional[FileFormat] = None
-        self._parser = pdf_parser_v2("error")  # @todo move it to construct
-
-
-    @property
-    def document(self) -> Optional[DoclingDocument]:
-        """Access the current DoclingDocument instance"""
-        return self._document
-
-    @classmethod
-    def name(cls) -> str:
-        return "docling"
-
-def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
-
-
-    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
-        temp_file.write(image.binary)
-        temp_filename = temp_file.name
-
-        doc_file = temp_filename
-        doc_key = f"key={file_format.filename}"
-
-    success = self._parser.load_document(doc_key, doc_file)
-
-    num_pages = self._parser.number_of_pages(doc_key)
-
-    for page in range(0, num_pages):
-
-        json_doc = self._parser.parse_pdf_from_key_on_page(doc_key, page)
-
-        if "pages" not in json_doc:
-            continue
-
-        json_page = json_doc["pages"][0]
-        print(json_page)
-
-
-    self._parser.unload_document(doc_key)
+    """
+    Extraction strategy for processing PDF documents using Docling.
+    """
+
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
+        """
+        Extracts text from a PDF file using Docling and returns an ExtractResult.
+
+        :param file_format: Instance of FileFormat (only supports PdfFileFormat).
+        :param language: Language of the text (default is 'en'); not used by this method.
+        :return: ExtractResult built from the DoclingDocument and this strategy's text_gatherer.
+        :raises ValueError: If file_format is not a PdfFileFormat.
+        """
+        import os  # function-scope import; needed only for temp-file cleanup
+        if not isinstance(file_format, PdfFileFormat):
+            raise ValueError("DoclingStrategy only supports PdfFileFormat.")
+
+        # delete=False temp file: remove after conversion (assumes document is fully loaded)
+        temp_file_path = self._save_to_temp_file(file_format)
+        try:
+            docling_document = self._convert_to_docling(temp_file_path)
+        finally:
+            os.unlink(temp_file_path)
+        return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
+
+    def text_gatherer(self, docling_document: DoclingDocument) -> str:
+        """
+        Returns the markdown produced by the document's to_markdown() method.
+
+        :param docling_document: Instance of DoclingDocument to render.
+        """
+        markdown_text = docling_document.to_markdown()
+        return markdown_text
+
+    def _convert_to_docling(self, file_path: str) -> DoclingDocument:
+        """
+        Converts a PDF file into a DoclingDocument instance.
+
+        :param file_path: Path to the PDF file to be converted.
+        :return: DoclingDocument instance.
+        :raises RuntimeError: If conversion fails; the original error is chained as the cause.
+        """
+        # Placeholder for actual conversion logic using the Docling API
+        try:
+            return DoclingDocument.from_file(file_path)
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert document using Docling: {e}") from e
+
+    def _save_to_temp_file(self, file_format: FileFormat) -> str:
+        """
+        Writes the FileFormat's content to a new temporary file with a ".pdf" suffix.
+
+        :param file_format: Instance of FileFormat; get_content() is assumed to return bytes.
+        :return: Path to the temporary file, which is not deleted automatically (delete=False).
+        """
+        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as pdf_file:
+            pdf_file.write(file_format.get_content())
+        return pdf_file.name
diff --git a/text_extract_api/files/file_formats/__init__.py b/text_extract_api/files/file_formats/__init__.py
@@ -0,0 +1,8 @@
+
+### WARNING
+###    This file is generated dynamically before git commit.
+###    Run ./scripts/dev/gen-file-format-init.sh from repository root.
+
+from .file_format import FileFormat
+from .pdf import PdfFileFormat
+from .image import ImageFileFormat
diff --git a/text_extract_api/main.py b/text_extract_api/main.py
@@ -19,7 +19,6 @@
 # Define base path as text_extract_api - required for keeping absolute namespaces
 sys.path.insert(0, str(pathlib.Path(__file__).parent.resolve()))
 
-
 def storage_profile_exists(profile_name: str) -> bool:
     profile_path = os.path.abspath(
         os.path.join(os.getenv('STORAGE_PROFILE_PATH', './storage_profiles'), f'{profile_name}.yaml'))
@@ -29,13 +28,11 @@ def storage_profile_exists(profile_name: str) -> bool:
         return os.path.isfile(sub_profile_path)
     return True
 
-
 app = FastAPI()
 # Connect to Redis
 redis_url = os.getenv('REDIS_CACHE_URL', 'redis://redis:6379/1')
 redis_client = redis.StrictRedis.from_url(redis_url)
 
-
 @app.post("/ocr")
 async def ocr_endpoint(
         strategy: str = Form(...),