Skip to content

Commit 839b02f

Browse files
committed
SimpleChatTC:SimpleProxy:Pdf2Text cleanup page number handling
It's not always necessary to request a page number range. Take care of page numbers starting from 1 while the underlying data uses 0 as its starting index.
1 parent c577ff0 commit 839b02f

File tree

1 file changed

+21
-15
lines changed

1 file changed

+21
-15
lines changed

tools/server/public_simplechat/local.tools/pdfmagic.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@
1010

1111

1212
def process_pdf2text(url: str, startPN: int, endPN: int):
13+
"""
14+
Extract textual content from given pdf.
15+
16+
* Validate the given url.
17+
* Extract textual contents of the pdf from given start page number to end page number (inclusive).
18+
* If -1 or 0 is specified for startPN, the actual starting page number (i.e. 1) will be used.
19+
* If -1 or 0 is specified for endPN, the actual ending page number will be used.
20+
21+
NOTE: Page numbers start from 1, while the underlying list data structure index starts from 0
22+
"""
1323
import pypdf
1424
import io
1525
gotVU = uv.validate_url(url, "HandlePdf2Text")
@@ -20,12 +30,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
2030
dPdf = fPdf.read()
2131
tPdf = ""
2232
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
23-
if (startPN < 0):
24-
startPN = 0
25-
if (endPN < 0) or (endPN >= len(oPdf.pages)):
26-
endPN = len(oPdf.pages)-1
33+
if (startPN <= 0):
34+
startPN = 1
35+
if (endPN <= 0) or (endPN > len(oPdf.pages)):
36+
endPN = len(oPdf.pages)
2737
for i in range(startPN, endPN+1):
28-
pd = oPdf.pages[i]
38+
pd = oPdf.pages[i-1]
2939
tPdf = tPdf + pd.extract_text()
3040
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
3141

@@ -37,16 +47,12 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
3747
"""
3848
queryParams = urllib.parse.parse_qs(pr.query)
3949
url = queryParams['url'][0]
40-
startP = queryParams['startPageNumber'][0]
41-
if startP:
42-
startP = int(startP)
43-
else:
44-
startP = -1
45-
endP = queryParams['endPageNumber'][0]
46-
if endP:
47-
endP = int(endP)
48-
else:
49-
endP = -1
50+
startP = queryParams.get('startPageNumber', -1)
51+
if isinstance(startP, list):
52+
startP = int(startP[0])
53+
endP = queryParams.get('endPageNumber', -1)
54+
if isinstance(endP, list):
55+
endP = int(endP[0])
5056
print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
5157
gotP2T = process_pdf2text(url, startP, endP)
5258
if (gotP2T['status'] != 200):

0 commit comments

Comments
 (0)