11import io
22import logging
3+ from ctypes import c_char_p , c_ushort
34from threading import RLock
45from typing import BinaryIO , List , Optional , Union
56
67import pypdfium2 as pdfium
78import pypdfium2 .raw as pdfium_c
9+ from _ctypes import POINTER
810
911from mindee .image_operations .image_compressor import compress_image
1012from mindee .pdf .pdf_char_data import PDFCharData
@@ -34,9 +36,12 @@ def compress_pdf(
3436 :return: Compressed PDF as bytes.
3537 """
3638 if not isinstance (pdf_data , bytes ):
37- pdf_data = pdf_data .read ()
39+ pdf_bytes = pdf_data .read ()
40+ pdf_data .seek (0 )
41+ else :
42+ pdf_bytes = pdf_data
3843
39- if has_source_text (pdf_data ):
44+ if has_source_text (pdf_bytes ):
4045 if force_source_text_compression :
4146 if not disable_source_text :
4247 logger .warning ("Re-writing PDF source-text is an EXPERIMENTAL feature." )
@@ -50,29 +55,29 @@ def compress_pdf(
5055 "Found text inside of the provided PDF file. Compression operation aborted since disableSourceText "
5156 "is set to 'true'."
5257 )
53- return pdf_data
58+ return pdf_bytes
5459
5560 extracted_text = (
56- extract_text_from_pdf (pdf_data ) if not disable_source_text else None
61+ extract_text_from_pdf (pdf_bytes ) if not disable_source_text else None
5762 )
5863
5964 compressed_pages = compress_pdf_pages (
60- pdf_data , extracted_text , image_quality , disable_source_text
65+ pdf_bytes , extracted_text , image_quality , disable_source_text
6166 )
6267
6368 if not compressed_pages :
6469 logger .warning (
6570 "Could not compress PDF to a smaller size. Returning original PDF."
6671 )
67- return pdf_data
72+ return pdf_bytes
6873
6974 out_pdf = attach_images_as_new_file (
7075 [io .BytesIO (compressed_page ) for compressed_page in compressed_pages ]
7176 )
72- out_bytes = io .BytesIO ()
73- out_pdf .save (out_bytes )
74-
75- return out_bytes .read ()
77+ out_buffer = io .BytesIO ()
78+ out_pdf .save (out_buffer )
79+ out_buffer . seek ( 0 )
80+ return out_buffer .read ()
7681
7782
7883def compress_pdf_pages (
@@ -110,40 +115,40 @@ def compress_pdf_pages(
110115
111116
def add_text_to_pdf_page(  # type: ignore
    document: pdfium.PdfDocument,
    page_id: int,
    extracted_text: Optional[List[PDFCharData]],
) -> None:
    """
    Adds text to a PDF page based on the extracted text data.

    One PDFium text object is created per entry of ``extracted_text``, filled with the
    entry's character(s), translated to ``(left, page_height - top)`` and inserted into
    the page. The page content stream is regenerated once, after all insertions.

    Entries whose font cannot be resolved by ``FPDFPageObj_NewTextObj`` (it returns NULL)
    are skipped with a warning instead of being passed on as a NULL handle.

    :param document: The PDFDocument object.
    :param page_id: ID of the current page.
    :param extracted_text: List of PDFCharData objects containing text and positioning information.
    """
    # Local imports keep this function self-contained; the module only imports
    # selected names from ctypes.
    import ctypes
    import logging

    if not extracted_text:
        return

    # Index the document exactly once: every ``document[page_id]`` yields a new page
    # wrapper (and a new native handle), so inserting through one handle and generating
    # content through another would drop the inserted objects.
    page = document[page_id]
    # NOTE(review): this lock is created per call, so it cannot serialize concurrent
    # callers; kept only to preserve the existing structure.
    pdfium_lock = RLock()

    try:
        height = page.get_height()
        with pdfium_lock:
            for char_data in extracted_text:
                if not char_data.char:
                    continue

                font_name = ctypes.c_char_p(char_data.font_name.encode("utf-8"))
                text_object = pdfium_c.FPDFPageObj_NewTextObj(
                    document.raw, font_name, char_data.font_size
                )
                if not text_object:
                    logging.getLogger(__name__).warning(
                        "Could not create a text object with font '%s'; skipping.",
                        char_data.font_name,
                    )
                    continue

                # FPDF_WIDESTRING must be a NUL-terminated UTF-16LE buffer. Encoding the
                # string (rather than wrapping ord() in a single c_ushort) also handles
                # surrogate pairs and multi-character entries.
                encoded_text = (char_data.char + "\x00").encode("utf-16-le")
                text_ptr = ctypes.cast(encoded_text, ctypes.POINTER(ctypes.c_ushort))
                if not pdfium_c.FPDFText_SetText(text_object, text_ptr):
                    # Not inserted into any page, so we still own it and must free it.
                    pdfium_c.FPDFPageObj_Destroy(text_object)
                    continue

                pdfium_c.FPDFPageObj_Transform(
                    text_object, 1, 0, 0, 1, char_data.left, height - char_data.top
                )
                # Ownership moves to the page here: the object must NOT be destroyed
                # afterwards, the page frees it.
                pdfium_c.FPDFPage_InsertObject(page.raw, text_object)

            pdfium_c.FPDFPage_GenerateContent(page.raw)
    finally:
        # Release the handle through its wrapper rather than a raw FPDF_ClosePage call,
        # so the wrapper's own finalizer does not close it a second time.
        page.close()
147152
148153
149154def compress_pages_with_quality (
@@ -164,12 +169,12 @@ def compress_pages_with_quality(
164169 pdf_document = pdfium .PdfDocument (pdf_data )
165170 compressed_pages = []
166171
167- for [_ , page ] in enumerate (pdf_document ):
172+ for [i , page ] in enumerate (pdf_document ):
168173 rasterized_page = rasterize_page (page , image_quality )
169174 compressed_image = compress_image (rasterized_page , image_quality )
170175
171176 if not disable_source_text :
172- add_text_to_pdf_page (page , extracted_text )
177+ add_text_to_pdf_page (pdf_document , i , extracted_text )
173178
174179 compressed_pages .append (compressed_image )
175180
0 commit comments