Skip to content

Commit 7bb1ce4

Browse files
♻️ update integration tests, add examples for latest features (#198)
1 parent 8b20b03 commit 7bb1ce4

File tree

5 files changed

+220
-11
lines changed

5 files changed

+220
-11
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import com.mindee.MindeeClient;
2+
import com.mindee.input.LocalInputSource;
3+
import com.mindee.extraction.ExtractedPDF;
4+
import com.mindee.extraction.PDFExtractor;
5+
import com.mindee.parsing.common.AsyncPredictResponse;
6+
import com.mindee.product.invoice.InvoiceV4;
7+
import com.mindee.product.invoicesplitter.InvoiceSplitterV1;
8+
9+
import java.io.File;
10+
import java.io.IOException;
11+
import java.util.List;
12+
13+
public class AutoInvoiceSplitterExtractionExample {
14+
private static final String API_KEY = "my-api-key";
15+
private static final MindeeClient mindeeClient = new MindeeClient(API_KEY);
16+
17+
public static void main(String[] args) throws IOException, InterruptedException {
18+
String filePath = "/path/to/the/file.ext";
19+
invoiceSplitterAutoExtraction(filePath);
20+
}
21+
22+
private static void invoiceSplitterAutoExtraction(String filePath) throws IOException, InterruptedException {
23+
LocalInputSource inputSource = new LocalInputSource(new File(filePath));
24+
25+
if (inputSource.isPdf() && new PDFExtractor(inputSource).getPageCount() > 1) {
26+
parseMultiPage(inputSource);
27+
} else {
28+
parseSinglePage(inputSource);
29+
}
30+
}
31+
32+
private static void parseSinglePage(LocalInputSource inputSource) throws IOException, InterruptedException {
33+
AsyncPredictResponse<InvoiceV4> invoiceResult = mindeeClient.enqueueAndParse(InvoiceV4.class, inputSource);
34+
System.out.println(invoiceResult.getDocumentObj().toString());
35+
}
36+
37+
private static void parseMultiPage(LocalInputSource inputSource) throws IOException, InterruptedException {
38+
PDFExtractor extractor = new PDFExtractor(inputSource);
39+
AsyncPredictResponse<InvoiceSplitterV1> invoiceSplitterResponse =
40+
mindeeClient.enqueueAndParse(InvoiceSplitterV1.class, inputSource);
41+
42+
List<ExtractedPDF> extractedPdfs = extractor.extractInvoices(
43+
invoiceSplitterResponse.getDocumentObj().getInference().getPrediction().getInvoicePageGroups(),
44+
false
45+
);
46+
47+
for (ExtractedPDF extractedPdf : extractedPdfs) {
48+
// Optional: Save the files locally
49+
// extractedPdf.writeToFile("output/path");
50+
51+
AsyncPredictResponse<InvoiceV4> invoiceResult =
52+
mindeeClient.enqueueAndParse(InvoiceV4.class, extractedPdf.asInputSource());
53+
System.out.println(invoiceResult.getDocumentObj().toString());
54+
}
55+
}
56+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import com.mindee.MindeeClient;
2+
import com.mindee.input.LocalInputSource;
3+
import com.mindee.extraction.ExtractedImage;
4+
import com.mindee.extraction.ImageExtractor;
5+
import com.mindee.parsing.common.PredictResponse;
6+
import com.mindee.parsing.common.Page;
7+
import com.mindee.product.multireceiptsdetector.MultiReceiptsDetectorV1;
8+
import com.mindee.product.multireceiptsdetector.MultiReceiptsDetectorV1Document;
9+
import com.mindee.product.receipt.ReceiptV5;
10+
11+
import java.io.File;
12+
import java.io.IOException;
13+
import java.util.List;
14+
15+
public class AutoMultiReceiptExtractionExample {
16+
private static final String API_KEY = "my-api-key";
17+
private static final MindeeClient mindeeClient = new MindeeClient(API_KEY);
18+
19+
public static void main(String[] args) throws IOException, InterruptedException {
20+
String myFilePath = "/path/to/the/file.ext";
21+
processMultiReceipts(myFilePath);
22+
}
23+
24+
private static void processMultiReceipts(String filePath) throws IOException, InterruptedException {
25+
LocalInputSource inputSource = new LocalInputSource(new File(filePath));
26+
PredictResponse<MultiReceiptsDetectorV1> resultSplit =
27+
mindeeClient.parse(MultiReceiptsDetectorV1.class, inputSource);
28+
29+
ImageExtractor imageExtractor = new ImageExtractor(inputSource);
30+
31+
for (Page<MultiReceiptsDetectorV1Document> page : resultSplit.getDocument().getInference().getPages()) {
32+
List<ExtractedImage> subImages = imageExtractor.extractImagesFromPage(
33+
page.getPrediction().getReceipts(),
34+
page.getPageId()
35+
);
36+
37+
for (ExtractedImage subImage : subImages) {
38+
// Optionally: write to a file
39+
// subImage.writeToFile("/path/to/my/extracted/file/folder");
40+
41+
PredictResponse<ReceiptV5> resultReceipt =
42+
mindeeClient.parse(ReceiptV5.class, subImage.asInputSource());
43+
System.out.println(resultReceipt.getDocument().toString());
44+
}
45+
}
46+
}
47+
}

src/test/java/com/mindee/TestingUtilities.java

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
package com.mindee;
22

3+
import java.util.Objects;
4+
35
public class TestingUtilities {
6+
/**
7+
* Retrieves the version from an RST prediction output.
8+
*
9+
* @param rstStr An RST prediction output string.
10+
* @return The version.
11+
*/
412
public static String getVersion(String rstStr) {
513
int versionLineStartPos = rstStr.indexOf(":Product: ");
614
int versionEndPos = rstStr.indexOf("\n", versionLineStartPos);
@@ -11,18 +19,102 @@ public static String getVersion(String rstStr) {
1119
return substring.substring(versionStartPos + 2);
1220
}
1321

22+
/**
23+
* Retrieves an ID from an RST prediction output string.
24+
*
25+
* @param rstStr An RST prediction output string.
26+
* @return The ID.
27+
*/
1428
public static String getId(String rstStr) {
1529
int idStartPos = rstStr.indexOf(":Mindee ID: ") + 12;
1630
int idEndPos = rstStr.indexOf("\n:Filename:");
1731

1832
return rstStr.substring(idStartPos, idEndPos);
1933
}
2034

35+
/**
36+
* Retrieves a filename from an RST prediction output string.
37+
*
38+
* @param rstStr An RST output string.
39+
* @return The filename.
40+
*/
2141
public static String getFileName(String rstStr) {
2242
int idStartPos = rstStr.indexOf(":Filename: ") + 11;
2343
int idEndPos = rstStr.indexOf("\n\nInference");
2444

2545
return rstStr.substring(idStartPos, idEndPos);
2646
}
47+
48+
/**
49+
* Compute the Levenshtein distance between two strings.
50+
* Taken & adapted from <a href="https://rosettacode.org/wiki/Levenshtein_distance#Iterative_space_optimized_(even_bounded)">here</a>
51+
*
52+
* @param refStr Source string to compare.
53+
* @param targetStr Target string to compare.
54+
* @return The Levenshtein distance between the two strings.
55+
*/
56+
private static int levenshteinDistance(String refStr, String targetStr) {
57+
if (Objects.equals(refStr, targetStr)) {
58+
return 0;
59+
}
60+
int sourceLength = refStr.length();
61+
int targetLength = targetStr.length();
62+
if (sourceLength == 0) {
63+
return targetLength;
64+
}
65+
if (targetLength == 0) {
66+
return sourceLength;
67+
}
68+
if (sourceLength < targetLength) {
69+
int tempLength = sourceLength;
70+
sourceLength = targetLength;
71+
targetLength = tempLength;
72+
String tempString = refStr;
73+
refStr = targetStr;
74+
targetStr = tempString;
75+
}
76+
77+
int[] distanceVector = new int[targetLength + 1];
78+
for (int i = 0; i <= targetLength; i++) {
79+
distanceVector[i] = i;
80+
}
81+
82+
for (int i = 1; i <= sourceLength; i++) {
83+
distanceVector[0] = i;
84+
int previousDistance = i - 1;
85+
int minDistance = previousDistance;
86+
for (int j = 1; j <= targetLength; j++) {
87+
int currentDistance =
88+
previousDistance + (refStr.charAt(i - 1) == targetStr.charAt(j - 1) ? 0 : 1);
89+
distanceVector[j] =
90+
Math.min(Math.min(1 + (previousDistance = distanceVector[j]), 1 + distanceVector[j - 1]),
91+
currentDistance);
92+
if (previousDistance < minDistance) {
93+
minDistance = previousDistance;
94+
}
95+
}
96+
}
97+
return distanceVector[targetLength];
98+
}
99+
100+
/**
101+
* Computes the Levenshtein ratio between two given strings.
102+
*
103+
* @param referenceString First string.
104+
* @param targetString Second string.
105+
* @return The ratio of similarities between the two strings.
106+
*/
107+
public static double levenshteinRatio(String referenceString, String targetString) {
108+
int referenceLength = referenceString.length();
109+
int targetLength = targetString.length();
110+
int maxLength = Math.max(referenceLength, targetLength);
111+
112+
if (referenceLength == 0 && targetLength == 0) {
113+
return 1.0;
114+
}
115+
116+
return 1.0 - (double) levenshteinDistance(referenceString, targetString) / maxLength;
117+
}
118+
27119
}
28120

src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package com.mindee.extraction;
22

3+
import static com.mindee.TestingUtilities.levenshteinRatio;
4+
35
import com.mindee.MindeeClient;
46
import com.mindee.MindeeException;
57
import com.mindee.TestingUtilities;
@@ -10,7 +12,6 @@
1012
import com.mindee.product.invoice.InvoiceV4;
1113
import com.mindee.product.invoicesplitter.InvoiceSplitterV1;
1214
import java.io.IOException;
13-
import java.nio.charset.StandardCharsets;
1415
import java.nio.file.Files;
1516
import java.nio.file.Paths;
1617
import java.util.List;
@@ -70,15 +71,28 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, Interrup
7071
Assertions.assertEquals(2, extractedPDFsStrict.size());
7172
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename());
7273
Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename());
73-
PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(extractedPDFsStrict.get(0).asInputSource());
74-
75-
String testStringRSTInvoice0 = prepareInvoiceReturn("src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst", invoice0.getDocument());
76-
Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"), invoice0.getDocument().toString().split(System.lineSeparator())));
77-
78-
PredictResponse<InvoiceV4> invoice1 = getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource());
79-
80-
String testStringRSTInvoice1 = prepareInvoiceReturn("src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst", invoice1.getDocument());
81-
Assertions.assertEquals(testStringRSTInvoice1, String.join(String.format("%n"), invoice1.getDocument().toString().split(System.lineSeparator())));
74+
PredictResponse<InvoiceV4> invoice0 =
75+
getInvoicePrediction(extractedPDFsStrict.get(0).asInputSource());
76+
77+
String testStringRSTInvoice0 = prepareInvoiceReturn(
78+
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst",
79+
invoice0.getDocument());
80+
Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"),
81+
invoice0.getDocument().toString().split(System.lineSeparator())));
82+
83+
PredictResponse<InvoiceV4> invoice1 =
84+
getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource());
85+
86+
String testStringRSTInvoice1 = prepareInvoiceReturn(
87+
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst",
88+
invoice1.getDocument());
89+
Assertions.assertTrue(
90+
levenshteinRatio(
91+
testStringRSTInvoice1,
92+
String.join(String.format("%n"),
93+
invoice1.getDocument().toString().split(System.lineSeparator())
94+
)
95+
) > 0.97);
8296

8397

8498
}

0 commit comments

Comments
 (0)