Skip to content

Commit a7178d7

Browse files
committed
[feat] image to PDF converter as marker-pdf supports only PDF files
1 parent 959105b commit a7178d7

File tree

4 files changed

+106
-1
lines changed

4 files changed

+106
-1
lines changed

config/strategies.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ strategies:
55
class: text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy
66
easyocr:
77
class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
8+
marker:
9+
class: text_extract_api.extract.strategies.marker.MarkerStrategy
text_extract_api/extract/strategies/marker.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import os
2+
import tempfile
3+
import time
4+
5+
from text_extract_api.extract.strategies.strategy import Strategy
6+
from text_extract_api.files.file_formats.file_format import FileFormat
7+
from text_extract_api.files.file_formats.image import ImageFileFormat
8+
from text_extract_api.files.file_formats.pdf import PdfFileFormat
9+
import requests
10+
11+
12+
class MarkerStrategy(Strategy):
    """Text-extraction strategy that delegates to a marker-pdf HTTP server.

    The input is converted to a single PDF (unless it already is one) and
    uploaded to the endpoint named by the ``MARKER_API_URL`` environment
    variable, which defaults to ``http://localhost:8002/marker/upload``.
    """

    @classmethod
    def name(cls) -> str:
        """Return the identifier of this strategy (``"marker"``)."""
        return "marker"

    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
        """Upload the document to the Marker API and return its output.

        Args:
            file_format: The document to process. Must be a ``PdfFileFormat``
                or report that it can be converted to one.
            language: Value sent to the server as the ``languages`` form field.

        Returns:
            The value stored under the ``output`` key of the server's JSON
            response, or an empty string when that key is absent.

        Raises:
            TypeError: If ``file_format`` is neither a PDF nor convertible
                to a PDF.
            ValueError: If the conversion yields zero or more than one PDF.
            Exception: If the upload fails, the server answers with a
                non-200 status, or the response cannot be decoded. The
                underlying error is attached as ``__cause__``.
        """
        if (
            not isinstance(file_format, PdfFileFormat)
            and not file_format.can_convert_to(PdfFileFormat)
        ):
            raise TypeError(
                f"Marker PDF - format {file_format.mime_type} is not supported (yet?)"
            )

        # Materialise the result so the size checks below also work when the
        # conversion hands back a lazy iterator (converters are generators).
        pdf_files = list(FileFormat.convert_to(file_format, PdfFileFormat))
        start_time = time.time()

        if len(pdf_files) > 1:
            raise ValueError("Only one PDF file is supported.")

        if not pdf_files:
            raise ValueError("No PDF file found - conversion error.")

        try:
            url = os.getenv("MARKER_API_URL", "http://localhost:8002/marker/upload")
            files = {'file': ('document.pdf', pdf_files[0].binary, 'application/pdf')}
            data = {
                'page_range': None,
                'languages': language,
                'force_ocr': False,
                'paginate_output': False,
                # NOTE(review): the original TODO here said "support JSON
                # output format" although 'json' is already requested -
                # confirm which output format is actually intended.
                'output_format': 'json',
            }

            # The server gives no incremental feedback, so a single fixed
            # progress value is reported just before the upload starts.
            meta = {
                'progress': '30',
                'status': 'OCR Processing',
                'start_time': start_time,
                'elapsed_time': time.time() - start_time,
            }
            self.update_state_callback(state='PROGRESS', meta=meta)

            # NOTE(review): no timeout is passed, so an unresponsive server
            # blocks this call indefinitely - consider making one configurable.
            response = requests.post(url, files=files, data=data)
            if response.status_code != 200:
                raise Exception(f"Failed to upload PDF file: {response.content}")

            extracted_text = response.json().get('output', '')
        except Exception as e:
            print('Error:', e)
            # Chain the original error so the real cause stays in the traceback.
            raise Exception(
                "Failed to generate text with Marker PDF API. "
                "Make sure marker-pdf server is up and running: "
                "marker_server --port 8002. "
                "Details: https://github.com/VikParuchuri/marker"
            ) from e

        return extracted_text
text_extract_api/files/converters/image_to_pdf.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from io import BytesIO
2+
from typing import Iterator, Type
3+
from PIL import Image
4+
from files.converters.converter import Converter
5+
from files.file_formats.image import ImageFileFormat
6+
from files.file_formats.pdf import PdfFileFormat
7+
8+
9+
class ImageToPdfConverter(Converter):
    """Converter that wraps an image into an in-memory PDF using Pillow."""

    @staticmethod
    def convert(file_format: ImageFileFormat) -> Iterator[PdfFileFormat]:
        """Yield a single PDF built from the given image.

        Args:
            file_format: Image whose ``binary`` payload is decoded by Pillow.

        Yields:
            One ``PdfFileFormat`` named ``"<original filename>.pdf"`` with
            mime type ``application/pdf``.
        """
        # The context manager releases the decoder's resources as soon as the
        # PDF bytes exist, instead of waiting for garbage collection.
        with Image.open(BytesIO(file_format.binary)) as image:
            pdf_bytes = ImageToPdfConverter._image_to_pdf_bytes(image)

        yield PdfFileFormat.from_binary(
            binary=pdf_bytes,
            filename=f"{file_format.filename}.pdf",
            mime_type="application/pdf",
        )

    @staticmethod
    def _image_to_pdf_bytes(image: Image.Image) -> bytes:
        """Serialise a Pillow image to PDF and return the raw bytes.

        Images with an alpha channel (``RGBA`` / ``LA``) are flattened onto a
        white background first: Pillow's PDF writer does not accept those
        modes in every release, and a white backdrop keeps text on
        transparent images legible.

        NOTE(review): only the image's current frame is written; multi-frame
        TIFF/GIF inputs would need ``save_all=True`` - confirm whether that
        is wanted.
        """
        if image.mode in ("RGBA", "LA"):
            rgba = image.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.getchannel("A"))
            image = flattened

        buffer = BytesIO()
        image.save(buffer, format="PDF")
        return buffer.getvalue()

text_extract_api/files/file_formats/image.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from enum import Enum
2-
from typing import Type
2+
from typing import Callable, Dict, Iterator, Type
33
from io import BytesIO
44
from PIL import Image
55

@@ -17,6 +17,15 @@ class ImageFileFormat(FileFormat):
1717
@staticmethod
1818
def accepted_mime_types() -> list[str]:
1919
return ["image/jpeg", "image/png", "image/bmp", "image/gif", "image/tiff"]
20+
21+
@staticmethod
def convertible_to() -> Dict[Type["FileFormat"], Callable[["FileFormat"], Iterator["FileFormat"]]]:
    """Return the conversions available for image files.

    Returns:
        A mapping from target format class to the converter callable that
        produces it. Each converter takes the source file format and
        returns an iterator of converted files. Images can currently only
        be converted to ``PdfFileFormat``.
    """
    # Imported inside the function rather than at module level - presumably
    # to avoid a circular import between the format and converter modules.
    from text_extract_api.files.file_formats.pdf import PdfFileFormat
    from text_extract_api.files.converters.image_to_pdf import ImageToPdfConverter

    return {
        PdfFileFormat: ImageToPdfConverter.convert,
    }
2029

2130
@staticmethod
2231
def is_pageable() -> bool:

0 commit comments

Comments
 (0)