Skip to content

Commit db78688

Browse files
committed
✨ add getting page count from a local input source
1 parent 45845d4 commit db78688

File tree

8 files changed

+400
-323
lines changed

8 files changed

+400
-323
lines changed

src/main/java/com/mindee/extraction/PDFExtractor.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,26 +111,24 @@ public List<ExtractedPDF> extractSubDocuments(List<List<Integer>> pageIndexes)
111111
return extractedPDFs;
112112
}
113113

114-
115114
/**
116115
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
117116
*
118117
* @param pageIndexes List of page indexes.
119118
* @return a list of extracted files.
120119
* @throws IOException Throws if the file can't be accessed.
121120
*/
122-
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes)
123-
throws IOException {
121+
public List<ExtractedPDF> extractInvoices(
122+
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes
123+
) throws IOException {
124124

125125
List<List<Integer>> indexes =
126126
pageIndexes.stream().map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes)
127127
.collect(Collectors.toList());
128128

129-
130129
return extractSubDocuments(indexes);
131130
}
132131

133-
134132
/**
135133
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
136134
*
@@ -139,8 +137,10 @@ public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup
139137
* @return a list of extracted files.
140138
* @throws IOException Throws if the file can't be accessed.
141139
*/
142-
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
143-
boolean strict) throws IOException {
140+
public List<ExtractedPDF> extractInvoices(
141+
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
142+
boolean strict
143+
) throws IOException {
144144
List<List<Integer>> correctPageIndexes = new ArrayList<>();
145145
if (!strict) {
146146
return extractInvoices(pageIndexes);

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.mindee.input;
22

33
import com.mindee.image.ImageCompressor;
4+
import com.mindee.pdf.PDFUtils;
45
import com.mindee.pdf.PdfBoxApi;
56
import com.mindee.pdf.PdfCompressor;
67
import com.mindee.pdf.PdfOperation;
@@ -48,6 +49,17 @@ public LocalInputSource(String fileAsBase64, String filename) {
4849
this.filename = filename;
4950
}
5051

52+
/**
53+
* Get the number of pages in the document.
54+
* @return the number of pages in the current file.
55+
* @throws IOException If an I/O error occurs during the PDF operation.
56+
*/
57+
public int getPageCount() throws IOException {
58+
if (!this.isPdf()) {
59+
return 1;
60+
}
61+
return PDFUtils.getNumberOfPages(this.file);
62+
}
5163

5264
/**
5365
* Applies PDF-specific operations on the current file based on the specified {@code PageOptions}.

src/main/java/com/mindee/pdf/PDFUtils.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,16 @@ private PDFUtils() {
3939
* @param inputSource The PDF file.
4040
*/
4141
public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
42-
PDDocument document = Loader.loadPDF(inputSource.getFile());
42+
return getNumberOfPages(inputSource.getFile());
43+
}
44+
45+
/**
46+
* Get the number of pages in the PDF.
47+
*
48+
* @param pdfBytes The PDF file as a byte array.
49+
*/
50+
public static int getNumberOfPages(byte[] pdfBytes) throws IOException {
51+
PDDocument document = Loader.loadPDF(pdfBytes);
4352
int pageCount = document.getNumberOfPages();
4453
document.close();
4554
return pageCount;

src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,8 @@ protected Document<InvoiceSplitterV1> getInvoiceSplitterPrediction() throws
4242
protected PredictResponse<InvoiceV4> getInvoicePrediction(LocalInputSource invoicePDF) throws
4343
IOException, MindeeException {
4444
return client.parse(InvoiceV4.class, invoicePDF);
45-
4645
}
4746

48-
4947
protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> invoicePrediction)
5048
throws IOException {
5149
List<String> rstRefLines = Files.readAllLines(Paths.get(rstFilePath));
@@ -60,7 +58,7 @@ protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> in
6058
}
6159

6260
@Test
63-
public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, InterruptedException {
61+
public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedException {
6462
Document<InvoiceSplitterV1> document = getInvoiceSplitterPrediction();
6563
InvoiceSplitterV1 inference = document.getInference();
6664

@@ -77,23 +75,30 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, Interrup
7775
String testStringRSTInvoice0 = prepareInvoiceReturn(
7876
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst",
7977
invoice0.getDocument());
80-
Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"),
81-
invoice0.getDocument().toString().split(System.lineSeparator())));
78+
79+
double invoice0Ratio = levenshteinRatio(
80+
testStringRSTInvoice0,
81+
String.join(
82+
String.format("%n"),
83+
invoice0.getDocument().toString().split(System.lineSeparator())
84+
)
85+
);
86+
Assertions.assertTrue(invoice0Ratio > 0.90);
8287

8388
PredictResponse<InvoiceV4> invoice1 =
8489
getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource());
8590

8691
String testStringRSTInvoice1 = prepareInvoiceReturn(
8792
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst",
8893
invoice1.getDocument());
89-
Assertions.assertTrue(
90-
levenshteinRatio(
94+
95+
double invoice1Ratio = levenshteinRatio(
9196
testStringRSTInvoice1,
92-
String.join(String.format("%n"),
93-
invoice1.getDocument().toString().split(System.lineSeparator())
97+
String.join(
98+
String.format("%n"),
99+
invoice1.getDocument().toString().split(System.lineSeparator())
94100
)
95-
) > 0.97);
96-
97-
101+
);
102+
Assertions.assertTrue(invoice1Ratio > 0.90);
98103
}
99104
}

0 commit comments

Comments
 (0)