@@ -37,12 +37,12 @@ def extract_text_from_pdf(pdf_bytes: bytes) -> List[List[PDFCharData]]:
3737 char_data_list : List [List [PDFCharData ]] = []
3838
3939 for i , page in enumerate (pdf ):
40- char_data_list .append (process_page (page , i , pdfium_lock ))
40+ char_data_list .append (_process_page (page , i , pdfium_lock ))
4141
4242 return char_data_list
4343
4444
45- def process_page (page , page_id : int , pdfium_lock : RLock ) -> List [PDFCharData ]:
45+ def _process_page (page , page_id : int , pdfium_lock : RLock ) -> List [PDFCharData ]:
4646 """
4747 Processes a single page of the PDF.
4848
@@ -59,7 +59,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
5959 count_chars = pdfium_c .FPDFText_CountChars (text_handler )
6060
6161 for i in range (count_chars ):
62- concatenated_chars = process_char (
62+ concatenated_chars = _process_char (
6363 i , text_handler , page , pdfium_lock , internal_height , internal_width , page_id
6464 )
6565 for concatenated_char in concatenated_chars :
@@ -70,7 +70,7 @@ def process_page(page, page_id: int, pdfium_lock: RLock) -> List[PDFCharData]:
7070 return char_data_list
7171
7272
73- def process_char (
73+ def _process_char (
7474 i : int ,
7575 text_handler ,
7676 page ,
@@ -91,21 +91,21 @@ def process_char(
9191 :param page_id: ID of the page the character was found on.
9292 :return: List of character data for a page.
9393 """
94- char_info = get_char_info (i , text_handler , pdfium_lock )
94+ char_info = _get_char_info (i , text_handler , pdfium_lock )
9595 if not char_info :
9696 return []
97- char_box = get_char_box (i , text_handler , pdfium_lock )
98- rotation = get_page_rotation (page , pdfium_lock )
97+ char_box = _get_char_box (i , text_handler , pdfium_lock )
98+ rotation = _get_page_rotation (page , pdfium_lock )
9999
100- adjusted_box = adjust_char_box (char_box , rotation , internal_height , internal_width )
100+ adjusted_box = _adjust_char_box (char_box , rotation , internal_height , internal_width )
101101 char_data_list : List [PDFCharData ] = []
102102 for c in char_info ["char" ] or " " :
103103 if c in (
104104 "\n " ,
105105 "\r " ,
106106 ): # Removes duplicated carriage returns in the PDF due to weird extraction.
107107 # IDK how to make this better, and neither does Claude, GPT4 nor GPT-o1, so I'm leaving this weird check.
108- next_char_info = get_char_info (i + 1 , text_handler , pdfium_lock )
108+ next_char_info = _get_char_info (i + 1 , text_handler , pdfium_lock )
109109 if not next_char_info or next_char_info ["char" ] in ("\n " , "\r " ):
110110 continue
111111
@@ -128,7 +128,7 @@ def process_char(
128128 return char_data_list
129129
130130
131- def get_char_info (i : int , text_handler , pdfium_lock : RLock ) -> dict :
131+ def _get_char_info (i : int , text_handler , pdfium_lock : RLock ) -> dict :
132132 """
133133 Retrieves information about a specific character.
134134
@@ -145,8 +145,8 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
145145 if unicode_char == 0xFF :
146146 return {}
147147 char = chr (unicode_char )
148- font_name = get_font_name (text_handler , i )
149- font_flags = get_font_flags (text_handler , i )
148+ font_name = _get_font_name (text_handler , i )
149+ font_flags = _get_font_flags (text_handler , i )
150150 font_size = pdfium_c .FPDFText_GetFontSize (text_handler , i )
151151 font_weight = pdfium_c .FPDFText_GetFontWeight (text_handler , i )
152152 _ = pdfium_c .FPDFText_GetStrokeColor (
@@ -167,7 +167,7 @@ def get_char_info(i: int, text_handler, pdfium_lock: RLock) -> dict:
167167 }
168168
169169
170- def get_font_name (text_handler , i : int ) -> str :
170+ def _get_font_name (text_handler , i : int ) -> str :
171171 """
172172 Retrieves the font name for a specific character.
173173
@@ -186,7 +186,7 @@ def get_font_name(text_handler, i: int) -> str:
186186 )
187187
188188
189- def get_font_flags (text_handler , i : int ) -> int :
189+ def _get_font_flags (text_handler , i : int ) -> int :
190190 """
191191 Retrieves the font flags for a specific character.
192192
@@ -199,7 +199,7 @@ def get_font_flags(text_handler, i: int) -> int:
199199 return flags .value
200200
201201
202- def get_char_box (
202+ def _get_char_box (
203203 i : int , text_handler , pdfium_lock : RLock
204204) -> Tuple [float , float , float , float ]:
205205 """
@@ -218,7 +218,7 @@ def get_char_box(
218218 return left .value , right .value , bottom .value , top .value
219219
220220
221- def get_page_rotation (page , pdfium_lock : RLock ) -> int :
221+ def _get_page_rotation (page , pdfium_lock : RLock ) -> int :
222222 """
223223 Retrieves the rotation value for a specific page.
224224
@@ -232,7 +232,7 @@ def get_page_rotation(page, pdfium_lock: RLock) -> int:
232232 )
233233
234234
235- def adjust_char_box (
235+ def _adjust_char_box (
236236 char_box : Tuple [float , float , float , float ],
237237 rotation : int ,
238238 internal_height : float ,
@@ -263,3 +263,15 @@ def adjust_char_box(
263263 internal_height - left ,
264264 )
265265 return left , right , top , bottom
266+
267+
def lerp(start: float, end: float, t: float) -> float:
    """
    Linearly blends two numbers.

    The result is the weighted sum ``start * (1 - t) + end * t``. ``t`` is not
    clamped, so values outside the 0 to 1 range extrapolate past the endpoints.

    :param start: Value produced when ``t`` is 0.
    :param end: Value produced when ``t`` is 1.
    :param t: Blend factor, nominally between 0 and 1.
    :return: The blended value.
    """
    # Weight each endpoint separately, then sum them (same operand order as
    # the single-expression form, so floating-point results are unchanged).
    start_term = start * (1 - t)
    end_term = end * t
    return start_term + end_term
0 commit comments