|
1 | | -import tempfile |
2 | | - |
3 | | -from typing import Optional |
4 | | -from docling_core.types.doc.document import DoclingDocument |
5 | | - |
6 | | -from docling_parse.docling_parse import pdf_parser_v2 |
7 | | - |
| 1 | +from text_extract_api.extract.extract_result import ExtractResult |
8 | 2 | from text_extract_api.extract.strategies.strategy import Strategy |
9 | | -from text_extract_api.files.file_formats.file_format import FileFormat |
| 3 | +from text_extract_api.files.file_formats import FileFormat, PdfFileFormat |
| 4 | +from docling import DoclingDocument # Assuming a compatible Docling library or module |
| 5 | +import tempfile |
10 | 6 |
|
11 | 7 | class DoclingStrategy(Strategy): |
12 | | - |
13 | | - def __init__(self): |
14 | | - super().__init__() |
15 | | - self._document: Optional[DoclingDocument] = None |
16 | | - self._current_file_format: Optional[FileFormat] = None |
17 | | - self._parser = pdf_parser_v2("error") # @todo move it to construct |
18 | | - |
19 | | - |
20 | | - @property |
21 | | - def document(self) -> Optional[DoclingDocument]: |
22 | | - """Access the current DoclingDocument instance""" |
23 | | - return self._document |
24 | | - |
25 | | - @classmethod |
26 | | - def name(cls) -> str: |
27 | | - return "docling" |
28 | | - |
29 | | -def extract_text(self, file_format: FileFormat, language: str = 'en') -> str: |
30 | | - |
31 | | - |
32 | | - with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file: |
33 | | - temp_file.write(image.binary) |
34 | | - temp_filename = temp_file.name |
35 | | - |
36 | | - doc_file = temp_filename |
37 | | - doc_key = f"key={file_format.filename}" |
38 | | - |
39 | | - success = self._parser.load_document(doc_key, doc_file) |
40 | | - |
41 | | - num_pages = self._parser.number_of_pages(doc_key) |
42 | | - |
43 | | - for page in range(0, num_pages): |
44 | | - |
45 | | - json_doc = self._parser.parse_pdf_from_key_on_page(doc_key, page) |
46 | | - |
47 | | - if "pages" not in json_doc: |
48 | | - continue |
49 | | - |
50 | | - json_page = json_doc["pages"][0] |
51 | | - print(json_page) |
52 | | - |
53 | | - |
54 | | - self._parser.unload_document(doc_key) |
| 8 | + """ |
| 9 | + Extraction strategy for processing PDF documents using Docling. |
| 10 | + """ |
| 11 | + |
def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
    """
    Extract the content of a PDF with Docling and wrap it in an ExtractResult.

    The file content is written to a temporary ``.pdf`` file, converted via
    ``_convert_to_docling``, and the temporary file is removed afterwards.

    :param file_format: File to process; must be a PdfFileFormat instance.
    :param language: Language of the text (default is 'en'). Not used by
        this method.
    :return: ExtractResult holding the converted document as ``value`` and
        ``self.text_gatherer`` as the text gatherer.
    :raises ValueError: If ``file_format`` is not a PdfFileFormat.
    """
    import os

    if not isinstance(file_format, PdfFileFormat):
        raise ValueError("DoclingStrategy only supports PdfFileFormat.")

    temp_file_path = self._save_to_temp_file(file_format)
    try:
        docling_document = self._convert_to_docling(temp_file_path)
    finally:
        # The temporary file is created with delete=False, so nothing else
        # removes it; without this every call leaves a PDF copy on disk.
        # NOTE(review): assumes the converted document no longer needs the
        # file once conversion has returned - confirm against the Docling API.
        try:
            os.remove(temp_file_path)
        except OSError:
            # Best-effort cleanup: a failed removal must not mask the
            # conversion result or the conversion error.
            pass

    return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
| 32 | + |
def text_gatherer(self, docling_document: DoclingDocument) -> str:
    """
    Render a DoclingDocument as markdown text.

    :param docling_document: Document whose ``to_markdown()`` output is wanted.
    :return: Whatever ``docling_document.to_markdown()`` returns.
    """
    markdown_text = docling_document.to_markdown()
    return markdown_text
| 41 | + |
def _convert_to_docling(self, file_path: str) -> DoclingDocument:
    """
    Convert a PDF file into a DoclingDocument instance.

    :param file_path: Path to the PDF file to be converted.
    :return: The object returned by ``DoclingDocument.from_file(file_path)``.
    :raises RuntimeError: If the conversion raises any exception; the
        original exception is attached as ``__cause__``.
    """
    # NOTE(review): `DoclingDocument.from_file` is marked in the original as a
    # placeholder for the actual Docling conversion API - confirm it exists.
    try:
        return DoclingDocument.from_file(file_path)
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as
        # the direct cause instead of an incidental "during handling" context.
        raise RuntimeError(f"Failed to convert document using Docling: {e}") from e
| 55 | + |
| 56 | + def _save_to_temp_file(self, file_format: FileFormat) -> str: |
| 57 | + """ |
| 58 | + Saves the content of a FileFormat instance to a temporary file. |
| 59 | +
|
| 60 | + :param file_format: Instance of FileFormat. |
| 61 | + :return: Path to the temporary file containing the file content. |
| 62 | + """ |
| 63 | + with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: |
| 64 | + temp_file.write(file_format.get_content()) # Assuming get_content provides binary content |
| 65 | + return temp_file.name |
0 commit comments