1+ import tempfile
2+
3+ from docling .document_converter import DocumentConverter
4+ from docling_core .types .doc .document import ( # Assuming a compatible Docling library or module
5+ DoclingDocument ,
6+ )
7+
18from text_extract_api .extract .extract_result import ExtractResult
29from text_extract_api .extract .strategies .strategy import Strategy
310from text_extract_api .files .file_formats import FileFormat , PdfFileFormat
4- from docling_core .types .doc .document import DoclingDocument # Assuming a compatible Docling library or module
5- import tempfile
6-
711
812class DoclingStrategy (Strategy ):
913 """
1014 Extraction strategy for processing PDF documents using Docling.
1115 """
1216
13- def extract_text (self , file_format : FileFormat , language : str = 'en' ) -> ExtractResult :
17+ def name (self ) -> str :
18+ return "docling"
19+
20+ def extract_text (
21+ self , file_format : FileFormat , language : str = "en"
22+ ) -> ExtractResult :
1423 """
1524 Extracts text from a PDF file using Docling and returns an ExtractResult.
1625
1726 :param file_format: Instance of FileFormat; PDF content is expected, but the type is not validated by this method.
1827 :param language: Language of the text (default is 'en').
1928 :return: ExtractResult containing the extracted DoclingDocument and metadata.
2029 """
21- if not isinstance (file_format , PdfFileFormat ):
22- raise ValueError ("DoclingStrategy only supports PdfFileFormat." )
2330
2431 # Save file content to a temporary file
2532 temp_file_path = self ._save_to_temp_file (file_format )
2633
2734 # Convert the document using Docling
2835 docling_document = self ._convert_to_docling (temp_file_path )
2936
30- print (docling_document )
3137 # Return the result wrapped in ExtractResult
3238 return ExtractResult (value = docling_document , text_gatherer = self .text_gatherer )
3339
@@ -38,7 +44,7 @@ def text_gatherer(self, docling_document: DoclingDocument) -> str:
3844 :param docling_document: Instance of DoclingDocument.
3945 :return: Text content in markdown format.
4046 """
41- return docling_document .to_markdown ()
47+ return docling_document .export_to_markdown ()
4248
4349 def _convert_to_docling (self , file_path : str ) -> DoclingDocument :
4450 """
@@ -49,7 +55,8 @@ def _convert_to_docling(self, file_path: str) -> DoclingDocument:
4955 """
5056 # Run Docling's DocumentConverter on the file and return the resulting document
5157 try :
52- docling_document = DoclingDocument .from_file (file_path )
58+ converter = DocumentConverter ()
59+ docling_document = converter .convert (file_path ).document
5360 return docling_document
5461 except Exception as e :
5562 raise RuntimeError (f"Failed to convert document using Docling: { e } " )
@@ -61,6 +68,6 @@ def _save_to_temp_file(self, file_format: FileFormat) -> str:
6168 :param file_format: Instance of FileFormat.
6269 :return: Path to the temporary file containing the file content.
6370 """
64- with tempfile .NamedTemporaryFile (delete = False , suffix = ' .pdf' ) as temp_file :
65- temp_file .write (file_format .get_content ()) # Assuming get_content provides binary content
71+ with tempfile .NamedTemporaryFile (delete = False , suffix = " .pdf" ) as temp_file :
72+ temp_file .write (file_format .binary )
6673 return temp_file .name
0 commit comments