1010
1111
1212def process_pdf2text (url : str , startPN : int , endPN : int ):
13+ """
14+ Extract textual content from given pdf.
15+
16+ * Validate the given url.
17+ * Extract textual contents of the pdf from given start page number to end page number (inclusive).
18+ * If -1 or 0 is specified for startPN, extraction starts from the first page (page number 1).
19+ * If -1 or 0 is specified for endPN, extraction runs through the last page of the pdf.
20+
21+ NOTE: Page numbers start from 1, while the underlying list data structure index starts from 0
22+ """
1323 import pypdf
1424 import io
1525 gotVU = uv .validate_url (url , "HandlePdf2Text" )
@@ -20,12 +30,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
2030 dPdf = fPdf .read ()
2131 tPdf = ""
2232 oPdf = pypdf .PdfReader (io .BytesIO (dPdf ))
23- if (startPN < 0 ):
24- startPN = 0
25- if (endPN < 0 ) or (endPN >= len (oPdf .pages )):
26- endPN = len (oPdf .pages )- 1
33+ if (startPN <= 0 ):
34+ startPN = 1
35+ if (endPN <= 0 ) or (endPN > len (oPdf .pages )):
36+ endPN = len (oPdf .pages )
2737 for i in range (startPN , endPN + 1 ):
28- pd = oPdf .pages [i ]
38+ pd = oPdf .pages [i - 1 ]
2939 tPdf = tPdf + pd .extract_text ()
3040 return { 'status' : 200 , 'msg' : "Pdf2Text Response follows" , 'data' : tPdf }
3141
@@ -37,16 +47,12 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
3747 """
3848 queryParams = urllib .parse .parse_qs (pr .query )
3949 url = queryParams ['url' ][0 ]
40- startP = queryParams ['startPageNumber' ][0 ]
41- if startP :
42- startP = int (startP )
43- else :
44- startP = - 1
45- endP = queryParams ['endPageNumber' ][0 ]
46- if endP :
47- endP = int (endP )
48- else :
49- endP = - 1
50+ startP = queryParams .get ('startPageNumber' , - 1 )
51+ if isinstance (startP , list ):
52+ startP = int (startP [0 ])
53+ endP = queryParams .get ('endPageNumber' , - 1 )
54+ if isinstance (endP , list ):
55+ endP = int (endP [0 ])
5056 print (f"INFO:HandlePdf2Text:Processing:{ url } :{ startP } :{ endP } ..." )
5157 gotP2T = process_pdf2text (url , startP , endP )
5258 if (gotP2T ['status' ] != 200 ):
0 commit comments