Skip to content

Commit 15d9bb8

Browse files
authored
Merge pull request #115 from btrojan-official/feature/54-add-docling-support-btrojan
Feature/54 add docling support btrojan
2 parents c1bd8e4 + 18cb5c4 commit 15d9bb8

File tree

2 files changed

+19
-11
lines changed

2 files changed

+19
-11
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"google-auth-httplib2",
2929
"google-auth-oauthlib",
3030
"transformers",
31+
"accelerate",
3132
"boto3",
3233
"Pillow",
3334
"python-magic==0.4.27",
Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,39 @@
1+
import tempfile
2+
3+
from docling.document_converter import DocumentConverter
4+
from docling_core.types.doc.document import ( # Assuming a compatible Docling library or module
5+
DoclingDocument,
6+
)
7+
18
from text_extract_api.extract.extract_result import ExtractResult
29
from text_extract_api.extract.strategies.strategy import Strategy
310
from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
4-
from docling_core.types.doc.document import DoclingDocument # Assuming a compatible Docling library or module
5-
import tempfile
6-
711

812
class DoclingStrategy(Strategy):
913
"""
1014
Extraction strategy for processing PDF documents using Docling.
1115
"""
1216

13-
def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
17+
def name(self) -> str:
18+
return "docling"
19+
20+
def extract_text(
21+
self, file_format: FileFormat, language: str = "en"
22+
) -> ExtractResult:
1423
"""
1524
Extracts text from a PDF file using Docling and returns an ExtractResult.
1625
1726
:param file_format: Instance of FileFormat; expected to be a PdfFileFormat (the content is written to a ".pdf" temporary file, and the type is not validated here).
1827
:param language: Language of the text (default is 'en').
1928
:return: ExtractResult containing the extracted DoclingDocument and metadata.
2029
"""
21-
if not isinstance(file_format, PdfFileFormat):
22-
raise ValueError("DoclingStrategy only supports PdfFileFormat.")
2330

2431
# Save file content to a temporary file
2532
temp_file_path = self._save_to_temp_file(file_format)
2633

2734
# Convert the document using Docling
2835
docling_document = self._convert_to_docling(temp_file_path)
2936

30-
print(docling_document)
3137
# Return the result wrapped in ExtractResult
3238
return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
3339

@@ -38,7 +44,7 @@ def text_gatherer(self, docling_document: DoclingDocument) -> str:
3844
:param docling_document: Instance of DoclingDocument.
3945
:return: Text content in markdown format.
4046
"""
41-
return docling_document.to_markdown()
47+
return docling_document.export_to_markdown()
4248

4349
def _convert_to_docling(self, file_path: str) -> DoclingDocument:
4450
"""
@@ -49,7 +55,8 @@ def _convert_to_docling(self, file_path: str) -> DoclingDocument:
4955
"""
5056
# Run Docling's DocumentConverter on the file and return the resulting document
5157
try:
52-
docling_document = DoclingDocument.from_file(file_path)
58+
converter = DocumentConverter()
59+
docling_document = converter.convert(file_path).document
5360
return docling_document
5461
except Exception as e:
5562
raise RuntimeError(f"Failed to convert document using Docling: {e}")
@@ -61,6 +68,6 @@ def _save_to_temp_file(self, file_format: FileFormat) -> str:
6168
:param file_format: Instance of FileFormat.
6269
:return: Path to the temporary file containing the file content.
6370
"""
64-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
65-
temp_file.write(file_format.get_content()) # Assuming get_content provides binary content
71+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
72+
temp_file.write(file_format.binary)
6673
return temp_file.name

0 commit comments

Comments
 (0)