Skip to content

Commit 54a97cd

Browse files
committed
[feat]: enhance DoclingStrategy with PDF extraction and temporary file handling
1 parent af0a9bc commit 54a97cd

File tree

3 files changed: 70 additions (+70) and 54 deletions (-54) lines changed
Lines changed: 62 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,54 +1,65 @@
1-
import tempfile
2-
3-
from typing import Optional
4-
from docling_core.types.doc.document import DoclingDocument
5-
6-
from docling_parse.docling_parse import pdf_parser_v2
7-
1+
from text_extract_api.extract.extract_result import ExtractResult
82
from text_extract_api.extract.strategies.strategy import Strategy
9-
from text_extract_api.files.file_formats.file_format import FileFormat
3+
from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
4+
from docling import DoclingDocument # Assuming a compatible Docling library or module
5+
import tempfile
106

117
class DoclingStrategy(Strategy):
12-
13-
def __init__(self):
14-
super().__init__()
15-
self._document: Optional[DoclingDocument] = None
16-
self._current_file_format: Optional[FileFormat] = None
17-
self._parser = pdf_parser_v2("error") # @todo move it to construct
18-
19-
20-
@property
21-
def document(self) -> Optional[DoclingDocument]:
22-
"""Access the current DoclingDocument instance"""
23-
return self._document
24-
25-
@classmethod
26-
def name(cls) -> str:
27-
return "docling"
28-
29-
def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
30-
31-
32-
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
33-
temp_file.write(image.binary)
34-
temp_filename = temp_file.name
35-
36-
doc_file = temp_filename
37-
doc_key = f"key={file_format.filename}"
38-
39-
success = self._parser.load_document(doc_key, doc_file)
40-
41-
num_pages = self._parser.number_of_pages(doc_key)
42-
43-
for page in range(0, num_pages):
44-
45-
json_doc = self._parser.parse_pdf_from_key_on_page(doc_key, page)
46-
47-
if "pages" not in json_doc:
48-
continue
49-
50-
json_page = json_doc["pages"][0]
51-
print(json_page)
52-
53-
54-
self._parser.unload_document(doc_key)
8+
"""
9+
Extraction strategy for processing PDF documents using Docling.
10+
"""
11+
12+
def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
13+
"""
14+
Extracts text from a PDF file using Docling and returns an ExtractResult.
15+
16+
:param file_format: Instance of FileFormat (only supports PdfFileFormat).
17+
:param language: Language of the text (default is 'en').
18+
:return: ExtractResult containing the extracted DoclingDocument and metadata.
19+
"""
20+
if not isinstance(file_format, PdfFileFormat):
21+
raise ValueError("DoclingStrategy only supports PdfFileFormat.")
22+
23+
# Save file content to a temporary file
24+
temp_file_path = self._save_to_temp_file(file_format)
25+
26+
# Convert the document using Docling
27+
docling_document = self._convert_to_docling(temp_file_path)
28+
29+
print(docling_document)
30+
# Return the result wrapped in ExtractResult
31+
return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
32+
33+
def text_gatherer(self, docling_document: DoclingDocument) -> str:
34+
"""
35+
Gathers text content from a DoclingDocument in markdown format.
36+
37+
:param docling_document: Instance of DoclingDocument.
38+
:return: Text content in markdown format.
39+
"""
40+
return docling_document.to_markdown()
41+
42+
def _convert_to_docling(self, file_path: str) -> DoclingDocument:
43+
"""
44+
Converts a PDF file into a DoclingDocument instance.
45+
46+
:param file_path: Path to the PDF file to be converted.
47+
:return: DoclingDocument instance.
48+
"""
49+
# Placeholder for actual conversion logic using the Docling API
50+
try:
51+
docling_document = DoclingDocument.from_file(file_path)
52+
return docling_document
53+
except Exception as e:
54+
raise RuntimeError(f"Failed to convert document using Docling: {e}")
55+
56+
def _save_to_temp_file(self, file_format: FileFormat) -> str:
57+
"""
58+
Saves the content of a FileFormat instance to a temporary file.
59+
60+
:param file_format: Instance of FileFormat.
61+
:return: Path to the temporary file containing the file content.
62+
"""
63+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
64+
temp_file.write(file_format.get_content()) # Assuming get_content provides binary content
65+
return temp_file.name
Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
2+
### WARNING
3+
### This file is generated dynamically before git commit.
4+
### Run ./scripts/dev/gen-file-format-init.sh from repository root.
5+
6+
from .file_format import FileFormat
7+
from .pdf import PdfFileFormat
8+
from .image import ImageFileFormat

text_extract_api/main.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@
1919
# Define base path as text_extract_api - required for keeping absolute namespaces
2020
sys.path.insert(0, str(pathlib.Path(__file__).parent.resolve()))
2121

22-
2322
def storage_profile_exists(profile_name: str) -> bool:
2423
profile_path = os.path.abspath(
2524
os.path.join(os.getenv('STORAGE_PROFILE_PATH', './storage_profiles'), f'{profile_name}.yaml'))
@@ -29,13 +28,11 @@ def storage_profile_exists(profile_name: str) -> bool:
2928
return os.path.isfile(sub_profile_path)
3029
return True
3130

32-
3331
app = FastAPI()
3432
# Connect to Redis
3533
redis_url = os.getenv('REDIS_CACHE_URL', 'redis://redis:6379/1')
3634
redis_client = redis.StrictRedis.from_url(redis_url)
3735

38-
3936
@app.post("/ocr")
4037
async def ocr_endpoint(
4138
strategy: str = Form(...),

0 commit comments

Comments
 (0)