|
| 1 | +import os |
| 2 | +import tempfile |
| 3 | +import time |
| 4 | + |
| 5 | +from text_extract_api.extract.strategies.strategy import Strategy |
| 6 | +from text_extract_api.files.file_formats.file_format import FileFormat |
| 7 | +from text_extract_api.files.file_formats.image import ImageFileFormat |
| 8 | +from text_extract_api.files.file_formats.pdf import PdfFileFormat |
| 9 | +import requests |
| 10 | + |
| 11 | + |
class MarkerStrategy(Strategy):
    """Text-extraction strategy that delegates to a Marker PDF HTTP server.

    The input document is converted to a single PDF and uploaded to the
    endpoint named by the ``MARKER_API_URL`` environment variable
    (default ``http://localhost:8002/marker/upload``).
    """

    # Seconds to wait for the TCP connection to the Marker server. Only the
    # connect phase is bounded: processing a document can legitimately take
    # a long time, so the read phase is left without a limit.
    _CONNECT_TIMEOUT_SECONDS = 10

    @classmethod
    def name(cls) -> str:
        """Return the strategy name, ``"marker"``."""
        return "marker"

    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
        """Upload the document to the Marker API and return its output.

        Args:
            file_format: The document to process. It must be a
                ``PdfFileFormat`` or report that it can be converted to one.
            language: Value sent to the server as the ``languages`` form
                field.

        Returns:
            The ``output`` field of the server's JSON response, or an empty
            string when the response has no such field.

        Raises:
            TypeError: If ``file_format`` is neither a PDF nor convertible
                to a PDF.
            ValueError: If the conversion yields no PDF or more than one.
            Exception: If the request fails, the server answers with a
                status other than 200, or the response is not valid JSON.
                The underlying error is attached as ``__cause__``.
        """
        if (
            not isinstance(file_format, PdfFileFormat)
            and not file_format.can_convert_to(PdfFileFormat)
        ):
            raise TypeError(
                f"Marker PDF - format {file_format.mime_type} is not supported (yet?)"
            )

        pdf_files = FileFormat.convert_to(file_format, PdfFileFormat)

        # Validate the conversion result before any timing/progress work.
        if not pdf_files:
            raise ValueError("No PDF file found - conversion error.")
        if len(pdf_files) > 1:
            raise ValueError("Only one PDF file is supported.")

        start_time = time.time()

        try:
            url = os.getenv("MARKER_API_URL", "http://localhost:8002/marker/upload")
            files = {'file': ('document.pdf', pdf_files[0].binary, 'application/pdf')}
            data = {
                'page_range': None,
                'languages': language,
                'force_ocr': False,
                'paginate_output': False,
                # NOTE(review): the previous TODO here read "support JSON
                # output format" although 'json' is already the value being
                # requested - confirm which format is actually intended.
                'output_format': 'json',
            }

            # The upload below is a single blocking call, so no finer-grained
            # progress is available; a fixed 30% is reported beforehand.
            # update_state_callback is not defined in this class; it is
            # presumably provided by Strategy.
            meta = {
                'progress': '30',
                'status': 'OCR Processing',
                'start_time': start_time,
                'elapsed_time': time.time() - start_time,
            }
            self.update_state_callback(state='PROGRESS', meta=meta)

            # (connect timeout, read timeout): fail fast when the server is
            # unreachable instead of blocking the worker forever, but do not
            # cut off a long-running conversion.
            response = requests.post(
                url,
                files=files,
                data=data,
                timeout=(self._CONNECT_TIMEOUT_SECONDS, None),
            )
            if response.status_code != 200:
                raise Exception(f"Failed to upload PDF file: {response.content}")

            return response.json().get('output', '')
        except Exception as e:
            print('Error:', e)
            # Chain the original error so the root cause stays visible in the
            # traceback; the raised type and message are unchanged.
            raise Exception(
                "Failed to generate text with Marker PDF API. "
                "Make sure marker-pdf server is up and running: "
                "marker_server --port 8002. "
                "Details: https://github.com/VikParuchuri/marker"
            ) from e
0 commit comments