Skip to content

Commit fcd47c8

Browse files
feat: working docling strategy for both pdf and image data
1 parent 4653c02 commit fcd47c8

File tree

1 file changed

+9
-7
lines changed

1 file changed

+9
-7
lines changed

text_extract_api/extract/strategies/docling.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
from text_extract_api.extract.extract_result import ExtractResult
22
from text_extract_api.extract.strategies.strategy import Strategy
33
from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
4-
from docling import DoclingDocument # Assuming a compatible Docling library or module
4+
from docling_core.types.doc.document import DoclingDocument  # Document type returned by DocumentConverter.convert(...).document
5+
from docling.document_converter import DocumentConverter
56
import tempfile
67

78
class DoclingStrategy(Strategy):
89
"""
910
Extraction strategy for processing PDF and image documents using Docling.
1011
"""
1112

13+
def name(self) -> str:
14+
return "docling"
15+
1216
def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
1317
"""
1418
Extracts text from a PDF or image file using Docling and returns an ExtractResult.
@@ -17,16 +21,13 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
1721
:param language: Language of the text (default is 'en').
1822
:return: ExtractResult containing the extracted DoclingDocument and metadata.
1923
"""
20-
if not isinstance(file_format, PdfFileFormat):
21-
raise ValueError("DoclingStrategy only supports PdfFileFormat.")
2224

2325
# Save file content to a temporary file
2426
temp_file_path = self._save_to_temp_file(file_format)
2527

2628
# Convert the document using Docling
2729
docling_document = self._convert_to_docling(temp_file_path)
2830

29-
print(docling_document)
3031
# Return the result wrapped in ExtractResult
3132
return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
3233

@@ -37,7 +38,7 @@ def text_gatherer(self, docling_document: DoclingDocument) -> str:
3738
:param docling_document: Instance of DoclingDocument.
3839
:return: Text content in markdown format.
3940
"""
40-
return docling_document.to_markdown()
41+
return docling_document.export_to_markdown()
4142

4243
def _convert_to_docling(self, file_path: str) -> DoclingDocument:
4344
"""
@@ -48,7 +49,8 @@ def _convert_to_docling(self, file_path: str) -> DoclingDocument:
4849
"""
4950
# Run the Docling conversion; any failure is re-raised below as a RuntimeError
5051
try:
51-
docling_document = DoclingDocument.from_file(file_path)
52+
converter = DocumentConverter()
53+
docling_document = converter.convert(file_path).document
5254
return docling_document
5355
except Exception as e:
5456
raise RuntimeError(f"Failed to convert document using Docling: {e}")
@@ -61,5 +63,5 @@ def _save_to_temp_file(self, file_format: FileFormat) -> str:
6163
:return: Path to the temporary file containing the file content.
6264
"""
6365
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
64-
temp_file.write(file_format.get_content()) # Assuming get_content provides binary content
66+
temp_file.write(file_format.binary)
6567
return temp_file.name

0 commit comments

Comments
 (0)