11from text_extract_api .extract .extract_result import ExtractResult
22from text_extract_api .extract .strategies .strategy import Strategy
33from text_extract_api .files .file_formats import FileFormat , PdfFileFormat
4- from docling import DoclingDocument # Assuming a compatible Docling library or module
4+ from docling_core .types .doc .document import DoclingDocument # Assuming a compatible Docling library or module
5+ from docling .document_converter import DocumentConverter
56import tempfile
67
class DoclingStrategy(Strategy):
    """
    Extraction strategy for processing PDF documents using Docling.
    """

    def name(self) -> str:
        """
        Returns the identifier of this strategy.

        :return: The string "docling".
        """
        return "docling"

    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
        """
        Extracts text from a PDF file using Docling and returns an ExtractResult.

        The file content is written to a temporary file, converted with
        Docling, and the temporary file is removed again whether or not the
        conversion succeeds.

        :param file_format: File whose ``binary`` content is converted.
        :param language: Language of the text (default is 'en'). It is not
            passed on to the Docling conversion.
        :return: ExtractResult wrapping the extracted DoclingDocument, with
            ``text_gatherer`` as the callback that renders it to text.
        :raises RuntimeError: If the Docling conversion fails.
        """
        import os  # local import: only needed here for temp-file cleanup

        # Save file content to a temporary file
        temp_file_path = self._save_to_temp_file(file_format)
        try:
            # Convert the document using Docling
            docling_document = self._convert_to_docling(temp_file_path)
        finally:
            # The temp file is created with delete=False, so it must be
            # removed explicitly; otherwise every extraction leaks a file.
            # NOTE(review): assumes the converted document no longer needs
            # the source file once convert() has returned -- confirm.
            try:
                os.remove(temp_file_path)
            except OSError:
                pass  # best-effort cleanup; never mask the conversion result

        # Return the result wrapped in ExtractResult
        return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)

    def text_gatherer(self, docling_document: DoclingDocument) -> str:
        """
        Renders a DoclingDocument as text.

        :param docling_document: Instance of DoclingDocument.
        :return: Text content in markdown format.
        """
        return docling_document.export_to_markdown()

    def _convert_to_docling(self, file_path: str) -> DoclingDocument:
        """
        Converts the file at the given path into a DoclingDocument.

        :param file_path: Path to the file to convert.
        :return: The ``document`` of the Docling conversion result.
        :raises RuntimeError: If Docling raises any exception; the original
            exception is chained as the cause.
        """
        try:
            converter = DocumentConverter()
            return converter.convert(file_path).document
        except Exception as e:
            raise RuntimeError(f"Failed to convert document using Docling: {e}") from e

    def _save_to_temp_file(self, file_format: FileFormat) -> str:
        """
        Writes the file's binary content to a new temporary file.

        The file is created with ``delete=False``; the caller is responsible
        for removing it.

        :param file_format: File whose ``binary`` content is written.
        :return: Path to the temporary file containing the file content.
        """
        # NOTE(review): the suffix is always '.pdf' even though the strategy
        # no longer rejects non-PDF inputs -- confirm whether the real
        # extension should be used here.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
            temp_file.write(file_format.binary)
            return temp_file.name
0 commit comments