diff --git a/document-ai/pom.xml b/document-ai/pom.xml
new file mode 100644
index 00000000000..3cd45cc7ac9
--- /dev/null
+++ b/document-ai/pom.xml
@@ -0,0 +1,64 @@
+
+
+ 4.0.0
+ com.example.documentai
+ documentai-snippets
+ jar
+ Google Document AI Snippets
+ https://github.com/GoogleCloudPlatform/java-docs-samples/tree/main/document-ai
+
+
+
+ com.google.cloud.samples
+ shared-configuration
+ 1.2.0
+
+
+
+ 1.8
+ 1.8
+ UTF-8
+
+
+
+
+
+
+
+ com.google.cloud
+ libraries-bom
+ 26.1.3
+ pom
+ import
+
+
+
+
+
+
+ com.google.cloud
+ google-cloud-document-ai
+ 2.7.5
+
+
+
+ com.google.cloud
+ google-cloud-storage
+
+
+ junit
+ junit
+ 4.13.2
+ test
+
+
+ com.google.truth
+ truth
+ 1.1.3
+ test
+
+
+
diff --git a/document-ai/resources/document_quality_poor.pdf b/document-ai/resources/document_quality_poor.pdf
new file mode 100644
index 00000000000..3a34a925c04
Binary files /dev/null and b/document-ai/resources/document_quality_poor.pdf differ
diff --git a/document-ai/resources/handwritten_form.pdf b/document-ai/resources/handwritten_form.pdf
new file mode 100644
index 00000000000..2189ffffd00
Binary files /dev/null and b/document-ai/resources/handwritten_form.pdf differ
diff --git a/document-ai/resources/invoice.pdf b/document-ai/resources/invoice.pdf
new file mode 100644
index 00000000000..7722734a430
Binary files /dev/null and b/document-ai/resources/invoice.pdf differ
diff --git a/document-ai/resources/multi_document.pdf b/document-ai/resources/multi_document.pdf
new file mode 100644
index 00000000000..7ea62eb8f78
Binary files /dev/null and b/document-ai/resources/multi_document.pdf differ
diff --git a/document-ai/resources/us_driver_license.pdf b/document-ai/resources/us_driver_license.pdf
new file mode 100644
index 00000000000..f8f62d902ee
Binary files /dev/null and b/document-ai/resources/us_driver_license.pdf differ
diff --git a/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java b/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java
new file mode 100644
index 00000000000..efee05e61ec
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+// [START documentai_batch_process_document]
+
+import com.google.api.gax.longrunning.OperationFuture;
+import com.google.api.gax.paging.Page;
+import com.google.cloud.documentai.v1.BatchDocumentsInputConfig;
+import com.google.cloud.documentai.v1.BatchProcessMetadata;
+import com.google.cloud.documentai.v1.BatchProcessRequest;
+import com.google.cloud.documentai.v1.BatchProcessResponse;
+import com.google.cloud.documentai.v1.Document;
+import com.google.cloud.documentai.v1.DocumentOutputConfig;
+import com.google.cloud.documentai.v1.DocumentOutputConfig.GcsOutputConfig;
+import com.google.cloud.documentai.v1.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1.GcsDocument;
+import com.google.cloud.documentai.v1.GcsDocuments;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BlobId;
+import com.google.cloud.storage.Bucket;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import com.google.protobuf.util.JsonFormat;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+public class BatchProcessDocument {
+ public static void batchProcessDocument()
+ throws IOException, InterruptedException, TimeoutException, ExecutionException {
+ // TODO(developer): Replace these variables before running the sample.
+ String projectId = "your-project-id";
+ String location = "your-project-location"; // Format is "us" or "eu".
+ String processerId = "your-processor-id";
+ String outputGcsBucketName = "your-gcs-bucket-name";
+ String outputGcsPrefix = "PREFIX";
+ String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.pdf";
+ batchProcessDocument(
+ projectId, location, processerId, inputGcsUri, outputGcsBucketName, outputGcsPrefix);
+ }
+
+ public static void batchProcessDocument(
+ String projectId,
+ String location,
+ String processorId,
+ String gcsInputUri,
+ String gcsOutputBucketName,
+ String gcsOutputUriPrefix)
+ throws IOException, InterruptedException, TimeoutException, ExecutionException {
+ // Initialize client that will be used to send requests. This client only needs to be created
+ // once, and can be reused for multiple requests. After completing all of your requests, call
+ // the "close" method on the client to safely clean up any remaining background resources.
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+ // The full resource name of the processor, e.g.:
+ // projects/project-id/locations/location/processor/processor-id
+ // You must create new processors in the Cloud Console first
+ String name =
+ String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+ GcsDocument gcsDocument =
+ GcsDocument.newBuilder().setGcsUri(gcsInputUri).setMimeType("application/pdf").build();
+
+ GcsDocuments gcsDocuments = GcsDocuments.newBuilder().addDocuments(gcsDocument).build();
+
+ BatchDocumentsInputConfig inputConfig =
+ BatchDocumentsInputConfig.newBuilder().setGcsDocuments(gcsDocuments).build();
+
+ String fullGcsPath = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix);
+ GcsOutputConfig gcsOutputConfig = GcsOutputConfig.newBuilder().setGcsUri(fullGcsPath).build();
+
+ DocumentOutputConfig documentOutputConfig =
+ DocumentOutputConfig.newBuilder().setGcsOutputConfig(gcsOutputConfig).build();
+
+ // Configure the batch process request.
+ BatchProcessRequest request =
+ BatchProcessRequest.newBuilder()
+ .setName(name)
+ .setInputDocuments(inputConfig)
+ .setDocumentOutputConfig(documentOutputConfig)
+ .build();
+
+ OperationFuture future =
+ client.batchProcessDocumentsAsync(request);
+
+ // Batch process document using a long-running operation.
+ // You can wait for now, or get results later.
+ // Note: first request to the service takes longer than subsequent
+ // requests.
+ System.out.println("Waiting for operation to complete...");
+ future.get(240, TimeUnit.SECONDS);
+
+ System.out.println("Document processing complete.");
+
+ Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
+ Bucket bucket = storage.get(gcsOutputBucketName);
+
+ // List all of the files in the Storage bucket.
+ Page blobs = bucket.list(Storage.BlobListOption.prefix(gcsOutputUriPrefix + "/"));
+ int idx = 0;
+ for (Blob blob : blobs.iterateAll()) {
+ if (!blob.isDirectory()) {
+ System.out.printf("Fetched file #%d\n", ++idx);
+ // Read the results
+
+ // Download and store json data in a temp file.
+ File tempFile = File.createTempFile("file", ".json");
+ Blob fileInfo = storage.get(BlobId.of(gcsOutputBucketName, blob.getName()));
+ fileInfo.downloadTo(tempFile.toPath());
+
+ // Parse json file into Document.
+ FileReader reader = new FileReader(tempFile);
+ Document.Builder builder = Document.newBuilder();
+ JsonFormat.parser().merge(reader, builder);
+
+ Document document = builder.build();
+
+ // Get all of the document text as one big string.
+ String text = document.getText();
+
+ // Read the text recognition output from the processor
+ System.out.println("The document contains the following paragraphs:");
+ Document.Page page1 = document.getPages(0);
+ List paragraphList = page1.getParagraphsList();
+ for (Document.Page.Paragraph paragraph : paragraphList) {
+ String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+ System.out.printf("Paragraph text:%s\n", paragraphText);
+ }
+
+ // Form parsing provides additional output about
+ // form-formatted PDFs. You must create a form
+ // processor in the Cloud Console to see full field details.
+ System.out.println("The following form key/value pairs were detected:");
+
+ for (Document.Page.FormField field : page1.getFormFieldsList()) {
+ String fieldName = getText(field.getFieldName().getTextAnchor(), text);
+ String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
+
+ System.out.println("Extracted form fields pair:");
+ System.out.printf("\t(%s, %s))", fieldName, fieldValue);
+ }
+
+ // Clean up temp file.
+ tempFile.deleteOnExit();
+ }
+ }
+ }
+ }
+
+ // Extract shards from the text field
+ private static String getText(Document.TextAnchor textAnchor, String text) {
+ if (textAnchor.getTextSegmentsList().size() > 0) {
+ int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
+ int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
+ return text.substring(startIdx, endIdx);
+ }
+ return "[NO TEXT]";
+ }
+}
+// [END documentai_batch_process_document]
diff --git a/document-ai/src/main/java/documentai/v1/ProcessDocument.java b/document-ai/src/main/java/documentai/v1/ProcessDocument.java
new file mode 100644
index 00000000000..75a5c639183
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1/ProcessDocument.java
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+// [START documentai_process_document]
+
+import com.google.cloud.documentai.v1.Document;
+import com.google.cloud.documentai.v1.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1.ProcessRequest;
+import com.google.cloud.documentai.v1.ProcessResponse;
+import com.google.cloud.documentai.v1.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+public class ProcessDocument {
+ public static void processDocument()
+ throws IOException, InterruptedException, ExecutionException, TimeoutException {
+ // TODO(developer): Replace these variables before running the sample.
+ String projectId = "your-project-id";
+ String location = "your-project-location"; // Format is "us" or "eu".
+ String processerId = "your-processor-id";
+ String filePath = "path/to/input/file.pdf";
+ processDocument(projectId, location, processerId, filePath);
+ }
+
+ public static void processDocument(
+ String projectId, String location, String processorId, String filePath)
+ throws IOException, InterruptedException, ExecutionException, TimeoutException {
+ // Initialize client that will be used to send requests. This client only needs to be created
+ // once, and can be reused for multiple requests. After completing all of your requests, call
+ // the "close" method on the client to safely clean up any remaining background resources.
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+ // The full resource name of the processor, e.g.:
+ // projects/project-id/locations/location/processor/processor-id
+ // You must create new processors in the Cloud Console first
+ String name =
+ String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+ // Read the file.
+ byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+ // Convert the image data to a Buffer and base64 encode it.
+ ByteString content = ByteString.copyFrom(imageFileData);
+
+ RawDocument document =
+ RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+ // Configure the process request.
+ ProcessRequest request =
+ ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+ // Recognizes text entities in the PDF document
+ ProcessResponse result = client.processDocument(request);
+ Document documentResponse = result.getDocument();
+
+ // Get all of the document text as one big string
+ String text = documentResponse.getText();
+
+ // Read the text recognition output from the processor
+ System.out.println("The document contains the following paragraphs:");
+ Document.Page firstPage = documentResponse.getPages(0);
+ List paragraphs = firstPage.getParagraphsList();
+
+ for (Document.Page.Paragraph paragraph : paragraphs) {
+ String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+ System.out.printf("Paragraph text:\n%s\n", paragraphText);
+ }
+
+ // Form parsing provides additional output about
+ // form-formatted PDFs. You must create a form
+ // processor in the Cloud Console to see full field details.
+ System.out.println("The following form key/value pairs were detected:");
+
+ for (Document.Page.FormField field : firstPage.getFormFieldsList()) {
+ String fieldName = getText(field.getFieldName().getTextAnchor(), text);
+ String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
+
+ System.out.println("Extracted form fields pair:");
+ System.out.printf("\t(%s, %s))\n", fieldName, fieldValue);
+ }
+ }
+ }
+
+ // Extract shards from the text field
+ private static String getText(Document.TextAnchor textAnchor, String text) {
+ if (textAnchor.getTextSegmentsList().size() > 0) {
+ int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
+ int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
+ return text.substring(startIdx, endIdx);
+ }
+ return "[NO TEXT]";
+ }
+}
+// [END documentai_process_document]
diff --git a/document-ai/src/main/java/documentai/v1/QuickStart.java b/document-ai/src/main/java/documentai/v1/QuickStart.java
new file mode 100644
index 00000000000..88f22136a5f
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1/QuickStart.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+// [START documentai_quickstart]
+import com.google.cloud.documentai.v1.Document;
+import com.google.cloud.documentai.v1.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1.ProcessRequest;
+import com.google.cloud.documentai.v1.ProcessResponse;
+import com.google.cloud.documentai.v1.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+public class QuickStart {
+  public static void main(String[] args)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    quickStart(projectId, location, processorId, filePath);
+  }
+
+  /** Sends a local PDF to the processor and prints the paragraphs found on its first page. */
+  public static void quickStart(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. After completing all of your requests, call
+    // the "close" method on the client to safely clean up any remaining background resources.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file and wrap its raw bytes in a ByteString (no base64 encoding happens here).
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+      ByteString content = ByteString.copyFrom(imageFileData);
+      RawDocument document =
+          RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+      // Recognizes text entities in the PDF document
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+
+      // Get all of the document text as one big string
+      String text = documentResponse.getText();
+      if (documentResponse.getPagesCount() == 0) {
+        System.out.println("The document contains no pages.");
+        return; // getPages(0) below would throw on an empty page list.
+      }
+
+      // Read the text recognition output from the processor
+      System.out.println("The document contains the following paragraphs:");
+      Document.Page firstPage = documentResponse.getPages(0);
+      List<Document.Page.Paragraph> paragraphs = firstPage.getParagraphsList();
+      for (Document.Page.Paragraph paragraph : paragraphs) {
+        String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+        System.out.printf("Paragraph text:\n%s\n", paragraphText);
+      }
+    }
+  }
+
+  // Extract shards from the text field: joins every segment the anchor references.
+  private static String getText(Document.TextAnchor textAnchor, String text) {
+    StringBuilder shards = new StringBuilder();
+    for (Document.TextAnchor.TextSegment segment : textAnchor.getTextSegmentsList()) {
+      shards.append(text, (int) segment.getStartIndex(), (int) segment.getEndIndex());
+    }
+    return textAnchor.getTextSegmentsCount() > 0 ? shards.toString() : "[NO TEXT]";
+  }
+}
+// [END documentai_quickstart]
diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java
new file mode 100644
index 00000000000..8a50d8533c6
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_form_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.cloud.documentai.v1beta3.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+public class ProcessFormDocument {
+  public static void processFormDocument()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    processFormDocument(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes a local PDF and prints the full document text followed by, for every page, a
+   * summary of each detected table (first header row and first body row) and all form field
+   * name/value pairs.
+   */
+  public static void processFormDocument(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. After completing all of your requests, call
+    // the "close" method on the client to safely clean up any remaining background resources.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file and wrap its raw bytes in a ByteString (no base64 encoding happens here).
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+      ByteString content = ByteString.copyFrom(imageFileData);
+      RawDocument document =
+          RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+      // Recognizes text entities in the PDF document
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+      System.out.println("Document processing complete.");
+
+      // For a full list of Document object attributes, please reference this page:
+      // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html
+
+      // Get all of the document text as one big string
+      String text = documentResponse.getText();
+      System.out.printf("Full document text: '%s'\n", removeNewlines(text));
+
+      List<Document.Page> pages = documentResponse.getPagesList();
+      System.out.printf("There are %s page(s) in this document.\n", pages.size());
+
+      for (Document.Page page : pages) {
+        System.out.printf("\n\n**** Page %d ****\n", page.getPageNumber());
+
+        List<Document.Page.Table> tables = page.getTablesList();
+        System.out.printf("Found %d table(s):\n", tables.size());
+        for (Document.Page.Table table : tables) {
+          printTableInfo(table, text);
+        }
+
+        List<Document.Page.FormField> formFields = page.getFormFieldsList();
+        System.out.printf("Found %d form fields:\n", formFields.size());
+        for (Document.Page.FormField formField : formFields) {
+          String fieldName = getLayoutText(formField.getFieldName().getTextAnchor(), text);
+          String fieldValue = getLayoutText(formField.getFieldValue().getTextAnchor(), text);
+          System.out.printf(
+              " * '%s': '%s'\n", removeNewlines(fieldName), removeNewlines(fieldValue));
+        }
+      }
+    }
+  }
+
+  // Prints the table's size plus its first header row and first body row, when present.
+  private static void printTableInfo(Document.Page.Table table, String text) {
+    // Guard the index-0 lookups: they throw if the table has no header or body rows.
+    int columnCount = table.getBodyRowsCount() > 0 ? table.getBodyRows(0).getCellsCount() : 0;
+    System.out.printf(
+        " Table with %d columns and %d rows:\n", columnCount, table.getBodyRowsCount());
+    if (table.getHeaderRowsCount() > 0) {
+      System.out.printf(" Columns: %s\n", rowToText(table.getHeaderRows(0), text));
+    }
+    if (table.getBodyRowsCount() > 0) {
+      System.out.printf(" First row data: %s\n", rowToText(table.getBodyRows(0), text));
+    }
+  }
+
+  // Joins the text of every cell in the row with " | "; an empty row yields an empty string.
+  private static String rowToText(Document.Page.Table.TableRow row, String text) {
+    StringBuilder rowText = new StringBuilder();
+    String separator = "";
+    for (Document.Page.Table.TableCell cell : row.getCellsList()) {
+      String cellText = getLayoutText(cell.getLayout().getTextAnchor(), text);
+      rowText.append(separator).append(removeNewlines(cellText));
+      separator = " | ";
+    }
+    return rowText.toString();
+  }
+
+  // Extract shards from the text field: joins every segment the anchor references.
+  private static String getLayoutText(Document.TextAnchor textAnchor, String text) {
+    StringBuilder shards = new StringBuilder();
+    for (Document.TextAnchor.TextSegment segment : textAnchor.getTextSegmentsList()) {
+      shards.append(text, (int) segment.getStartIndex(), (int) segment.getEndIndex());
+    }
+    return textAnchor.getTextSegmentsCount() > 0 ? shards.toString() : "[NO TEXT]";
+  }
+
+  // Strips line breaks so each value prints on a single line.
+  private static String removeNewlines(String s) {
+    return s.replace("\n", "").replace("\r", "");
+  }
+}
+// [END documentai_process_form_document]
diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java
new file mode 100644
index 00000000000..f483929a13e
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_ocr_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.cloud.documentai.v1beta3.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+public class ProcessOcrDocument {
+ public static void processOcrDocument()
+ throws IOException, InterruptedException, ExecutionException, TimeoutException {
+ // TODO(developer): Replace these variables before running the sample.
+ String projectId = "your-project-id";
+ String location = "your-project-location"; // Format is "us" or "eu".
+ String processerId = "your-processor-id";
+ String filePath = "path/to/input/file.pdf";
+ processOcrDocument(projectId, location, processerId, filePath);
+ }
+
+ public static void processOcrDocument(
+ String projectId, String location, String processorId, String filePath)
+ throws IOException, InterruptedException, ExecutionException, TimeoutException {
+ // Initialize client that will be used to send requests. This client only needs to be created
+ // once, and can be reused for multiple requests. After completing all of your requests, call
+ // the "close" method on the client to safely clean up any remaining background resources.
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+ // The full resource name of the processor, e.g.:
+ // projects/project-id/locations/location/processor/processor-id
+ // You must create new processors in the Cloud Console first
+ String name =
+ String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+ // Read the file.
+ byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+ // Convert the image data to a Buffer and base64 encode it.
+ ByteString content = ByteString.copyFrom(imageFileData);
+
+ RawDocument document =
+ RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+ // Configure the process request.
+ ProcessRequest request =
+ ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+ // Recognizes text entities in the PDF document
+ ProcessResponse result = client.processDocument(request);
+ Document documentResponse = result.getDocument();
+
+ System.out.println("Document processing complete.");
+
+ // Read the text recognition output from the processor
+ // For a full list of Document object attributes,
+ // please reference this page:
+ // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html
+
+ // Get all of the document text as one big string
+ String text = documentResponse.getText();
+ System.out.printf("Full document text: '%s'\n", escapeNewlines(text));
+
+ // Read the text recognition output from the processor
+ List pages = documentResponse.getPagesList();
+ System.out.printf("There are %s page(s) in this document.\n", pages.size());
+
+ for (Document.Page page : pages) {
+ System.out.printf("Page %d:\n", page.getPageNumber());
+ printPageDimensions(page.getDimension());
+ printDetectedLanguages(page.getDetectedLanguagesList());
+ printParagraphs(page.getParagraphsList(), text);
+ printBlocks(page.getBlocksList(), text);
+ printLines(page.getLinesList(), text);
+ printTokens(page.getTokensList(), text);
+ }
+ }
+ }
+
+ private static void printPageDimensions(Document.Page.Dimension dimension) {
+ String unit = dimension.getUnit();
+ System.out.printf(" Width: %.1f %s\n", dimension.getWidth(), unit);
+ System.out.printf(" Height: %.1f %s\n", dimension.getHeight(), unit);
+ }
+
+ private static void printDetectedLanguages(
+ List detectedLangauges) {
+ System.out.println(" Detected languages:");
+ for (Document.Page.DetectedLanguage detectedLanguage : detectedLangauges) {
+ String languageCode = detectedLanguage.getLanguageCode();
+ float confidence = detectedLanguage.getConfidence();
+ System.out.printf(" %s (%.2f%%)\n", languageCode, confidence * 100.0);
+ }
+ }
+
+ private static void printParagraphs(List paragraphs, String text) {
+ System.out.printf(" %d paragraphs detected:\n", paragraphs.size());
+ Document.Page.Paragraph firstParagraph = paragraphs.get(0);
+ String firstParagraphText = getLayoutText(firstParagraph.getLayout().getTextAnchor(), text);
+ System.out.printf(" First paragraph text: %s\n", escapeNewlines(firstParagraphText));
+ Document.Page.Paragraph lastParagraph = paragraphs.get(paragraphs.size() - 1);
+ String lastParagraphText = getLayoutText(lastParagraph.getLayout().getTextAnchor(), text);
+ System.out.printf(" Last paragraph text: %s\n", escapeNewlines(lastParagraphText));
+ }
+
+ private static void printBlocks(List blocks, String text) {
+ System.out.printf(" %d blocks detected:\n", blocks.size());
+ Document.Page.Block firstBlock = blocks.get(0);
+ String firstBlockText = getLayoutText(firstBlock.getLayout().getTextAnchor(), text);
+ System.out.printf(" First block text: %s\n", escapeNewlines(firstBlockText));
+ Document.Page.Block lastBlock = blocks.get(blocks.size() - 1);
+ String lastBlockText = getLayoutText(lastBlock.getLayout().getTextAnchor(), text);
+ System.out.printf(" Last block text: %s\n", escapeNewlines(lastBlockText));
+ }
+
+ private static void printLines(List lines, String text) {
+ System.out.printf(" %d lines detected:\n", lines.size());
+ Document.Page.Line firstLine = lines.get(0);
+ String firstLineText = getLayoutText(firstLine.getLayout().getTextAnchor(), text);
+ System.out.printf(" First line text: %s\n", escapeNewlines(firstLineText));
+ Document.Page.Line lastLine = lines.get(lines.size() - 1);
+ String lastLineText = getLayoutText(lastLine.getLayout().getTextAnchor(), text);
+ System.out.printf(" Last line text: %s\n", escapeNewlines(lastLineText));
+ }
+
+ private static void printTokens(List tokens, String text) {
+ System.out.printf(" %d tokens detected:\n", tokens.size());
+ Document.Page.Token firstToken = tokens.get(0);
+ String firstTokenText = getLayoutText(firstToken.getLayout().getTextAnchor(), text);
+ System.out.printf(" First token text: %s\n", escapeNewlines(firstTokenText));
+ Document.Page.Token lastToken = tokens.get(tokens.size() - 1);
+ String lastTokenText = getLayoutText(lastToken.getLayout().getTextAnchor(), text);
+ System.out.printf(" Last token text: %s\n", escapeNewlines(lastTokenText));
+ }
+
+ // Extract shards from the text field
+ private static String getLayoutText(Document.TextAnchor textAnchor, String text) {
+ if (textAnchor.getTextSegmentsList().size() > 0) {
+ int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
+ int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
+ return text.substring(startIdx, endIdx);
+ }
+ return "[NO TEXT]";
+ }
+
+ private static String escapeNewlines(String s) {
+ return s.replace("\n", "\\n").replace("\r", "\\r");
+ }
+}
+// [END documentai_process_ocr_document]
diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java
new file mode 100644
index 00000000000..3e80a574f72
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_quality_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.cloud.documentai.v1beta3.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Sample that sends a local PDF to a Document AI processor and prints the quality scores found
+ * in the entities of the response.
+ */
+public class ProcessQualityDocument {
+  /**
+   * Runs the sample with placeholder values; replace them before running.
+   *
+   * @throws IOException propagated from the four-argument overload
+   */
+  public static void processQualityDocument()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    processQualityDocument(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes the PDF at {@code filePath} and prints, for each entity in the response, its
+   * confidence as a page quality score followed by the score of every nested property.
+   *
+   * @param projectId project that owns the processor
+   * @param location processor location, e.g. "us" or "eu"
+   * @param processorId ID of the processor to call
+   * @param filePath path of the local PDF to send
+   * @throws IOException if the file cannot be read
+   */
+  public static void processQualityDocument(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. The try-with-resources statement closes the
+    // client when this block exits.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file.
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+      // Wrap the raw file bytes in a ByteString; no extra encoding step is performed here.
+      ByteString content = ByteString.copyFrom(imageFileData);
+
+      RawDocument document =
+          RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+      // Send the request and take the processed document out of the response.
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+
+      System.out.println("Document processing complete.");
+
+      // Read the quality-specific information from the output from the
+      // Intelligent Document Quality Processor:
+      // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-quality-processor
+      // OCR and other data is also present in the quality processor's response.
+      // Please see the OCR and other samples for how to parse other data in the
+      // response.
+      List<Document.Entity> entities = documentResponse.getEntitiesList();
+      for (Document.Entity entity : entities) {
+        float entityConfidence = entity.getConfidence();
+        // 1 is added so the printed page number starts at 1 (presumably the API index is
+        // zero-based). NOTE(review): getPageRefs(0) assumes every entity carries at least one
+        // page reference.
+        long pageNumber = entity.getPageAnchor().getPageRefs(0).getPage() + 1;
+        System.out.printf(
+            "Page %d has a quality score of (%.2f%%):\n", pageNumber, entityConfidence * 100.0);
+        for (Document.Entity property : entity.getPropertiesList()) {
+          float propertyConfidence = property.getConfidence();
+          String propertyType = property.getType();
+          System.out.printf(" * %s score of %.2f%%\n", propertyType, propertyConfidence * 100.0);
+        }
+      }
+    }
+  }
+}
+// [END documentai_process_quality_document]
diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java
new file mode 100644
index 00000000000..5cbb1af107c
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_specialized_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.cloud.documentai.v1beta3.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Sample that sends a local PDF to a specialized Document AI processor and prints each entity
+ * found in the response.
+ */
+public class ProcessSpecializedDocument {
+  /** Entry point with placeholder arguments; edit the values below before running. */
+  public static void processSpecializedDocument()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    processSpecializedDocument(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes the PDF at {@code filePath} with the given processor and prints the type, text and
+   * confidence of every entity in the response.
+   */
+  public static void processSpecializedDocument(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // The client can be reused for many requests; here try-with-resources closes it when the
+    // block exits so its background resources are cleaned up.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // Full resource name of the processor:
+      //   projects/project-id/locations/location/processors/processor-id
+      // The processor has to be created in the Cloud Console beforehand.
+      String processorName =
+          "projects/" + projectId + "/locations/" + location + "/processors/" + processorId;
+
+      // Load the PDF from disk and wrap its raw bytes for the request.
+      ByteString pdfBytes = ByteString.copyFrom(Files.readAllBytes(Paths.get(filePath)));
+      RawDocument rawDocument =
+          RawDocument.newBuilder().setMimeType("application/pdf").setContent(pdfBytes).build();
+
+      ProcessRequest processRequest =
+          ProcessRequest.newBuilder().setRawDocument(rawDocument).setName(processorName).build();
+
+      // Call the processor and take the processed document out of the response.
+      Document parsed = client.processDocument(processRequest).getDocument();
+
+      System.out.println("Document processing complete.");
+
+      // Read fields specifically from the specialized US driver license processor:
+      // https://cloud.google.com/document-ai/docs/processors-list#processor_us-driver-license-parser
+      // Retrieving data from other specialized processors follows a similar pattern.
+      // For a complete list of processors see:
+      // https://cloud.google.com/document-ai/docs/processors-list
+      //
+      // OCR and other data is also present in the specialized processor's response.
+      // Please see the OCR and other samples for how to parse other data in the
+      // response.
+      for (Document.Entity entity : parsed.getEntitiesList()) {
+        // Value formats other than text are also available,
+        // e.g. dates: `entity.getNormalizedValue().getDateValue().getYear()`;
+        // check for a normalized value with `entity.hasNormalizedValue()`.
+        String printableText = escapeNewlines(entity.getTextAnchor().getContent());
+        // The entity type is the detected field. For the fields of each processor see
+        // https://cloud.google.com/document-ai/docs/processors-list
+        System.out.printf(
+            " * %s: %s (%.2f%% confident)\n",
+            entity.getType(), printableText, entity.getConfidence() * 100.0);
+      }
+    }
+  }
+
+  /** Replaces line feeds and carriage returns with visible backslash escapes. */
+  private static String escapeNewlines(String s) {
+    StringBuilder escaped = new StringBuilder(s.length());
+    for (char c : s.toCharArray()) {
+      if (c == '\n') {
+        escaped.append("\\n");
+      } else if (c == '\r') {
+        escaped.append("\\r");
+      } else {
+        escaped.append(c);
+      }
+    }
+    return escaped.toString();
+  }
+}
+// [END documentai_process_specialized_document]
diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java
new file mode 100644
index 00000000000..e63e2f8e4cf
--- /dev/null
+++ b/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_splitter_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.cloud.documentai.v1beta3.RawDocument;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Sample that sends a local PDF to a Document AI splitter processor and prints the page range
+ * and confidence of each subdocument found in the response.
+ */
+public class ProcessSplitterDocument {
+  /**
+   * Runs the sample with placeholder values; replace them before running.
+   *
+   * @throws IOException propagated from the four-argument overload
+   */
+  public static void processSplitterDocument()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    processSplitterDocument(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes the PDF at {@code filePath} and prints one line per entity in the response: the
+   * confidence, the pages the entity refers to and, when present, the entity type.
+   *
+   * @param projectId project that owns the processor
+   * @param location processor location, e.g. "us" or "eu"
+   * @param processorId ID of the processor to call
+   * @param filePath path of the local PDF to send
+   * @throws IOException if the file cannot be read
+   */
+  public static void processSplitterDocument(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. The try-with-resources statement closes the
+    // client when this block exits.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file.
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+      // Wrap the raw file bytes in a ByteString; no extra encoding step is performed here.
+      ByteString content = ByteString.copyFrom(imageFileData);
+
+      RawDocument document =
+          RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setRawDocument(document).build();
+
+      // Send the request and take the processed document out of the response.
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+
+      System.out.println("Document processing complete.");
+
+      // Read the splitter output from the document splitter processor:
+      // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-splitter
+      // This processor only provides text for the document and information on how
+      // to split the document on logical boundaries. To identify and extract text,
+      // form elements, and entities please see other processors like the OCR, form,
+      // and specialized processors.
+      List<Document.Entity> entities = documentResponse.getEntitiesList();
+      System.out.printf("Found %d subdocuments:\n", entities.size());
+      for (Document.Entity entity : entities) {
+        float entityConfidence = entity.getConfidence();
+        String pagesRangeText = pageRefsToString(entity.getPageAnchor().getPageRefsList());
+        String subdocumentType = entity.getType();
+        if (subdocumentType.isEmpty()) {
+          System.out.printf(
+              "%.2f%% confident that %s a subdocument.\n", entityConfidence * 100, pagesRangeText);
+        } else {
+          System.out.printf(
+              "%.2f%% confident that %s a '%s' subdocument.\n",
+              entityConfidence * 100, pagesRangeText, subdocumentType);
+        }
+      }
+    }
+  }
+
+  /**
+   * Converts page reference(s) to a phrase describing the page or page range, ending in the
+   * matching verb ("is"/"are") so it can be dropped into the sentences printed above.
+   *
+   * @param pageRefs page references of one entity
+   * @return "page N is" for one reference, "pages A to B are" for several (A from the first
+   *     reference, B from the last), or "an unknown page range is" when the list is empty
+   */
+  private static String pageRefsToString(List<Document.PageAnchor.PageRef> pageRefs) {
+    if (pageRefs.isEmpty()) {
+      // Without this guard pageRefs.get(0) would throw IndexOutOfBoundsException.
+      return "an unknown page range is";
+    }
+    // 1 is added so printed page numbers start at 1.
+    long start = pageRefs.get(0).getPage() + 1;
+    if (pageRefs.size() == 1) {
+      return String.format("page %d is", start);
+    }
+    // Take the end from the LAST reference, not from index 1, so a list with three or more
+    // references reports its full range. Assumes references are listed in page order - TODO
+    // confirm.
+    long end = pageRefs.get(pageRefs.size() - 1).getPage() + 1;
+    return String.format("pages %d to %d are", start, end);
+  }
+}
+// [END documentai_process_splitter_document]
diff --git a/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java b/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java
new file mode 100644
index 00000000000..1024ae71fc3
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import com.google.api.gax.paging.Page;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BucketInfo;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.UUID;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Integration test for the {@code BatchProcessDocument} sample. It needs the GOOGLE_CLOUD_PROJECT
+ * and GOOGLE_APPLICATION_CREDENTIALS environment variables and uses the Cloud Storage client.
+ */
+public class BatchProcessDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "88541adc6eeec481";
+  // Scratch bucket created in setUp and deleted again in tearDown.
+  // NOTE(review): nothing visible in this test writes to it - confirm whether it is still needed.
+  private static final String BUCKET_NAME =
+      String.format("document-ai-output-test-%s", UUID.randomUUID());
+  private static final String INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf";
+  // Random per run, so cleanup only matches objects stored under this prefix.
+  private static final String OUTPUT_PREFIX = String.format("%s", UUID.randomUUID());
+  // Bucket handed to the sample as its output destination; it is named after the project.
+  private static final String OUTPUT_BUCKET_NAME = PROJECT_ID;
+
+  private ByteArrayOutputStream bout;
+  private PrintStream out;
+  private PrintStream originalPrintStream;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    assertNotNull(
+        String.format("Environment variable '%s' must be set to perform these tests.", varName),
+        System.getenv(varName));
+  }
+
+  /**
+   * Deletes everything stored under OUTPUT_PREFIX in OUTPUT_BUCKET_NAME, i.e. in the bucket that
+   * the test passes to the sample (the listing previously targeted the unused scratch bucket).
+   */
+  private static void cleanUpBucket() {
+    Storage storage = StorageOptions.getDefaultInstance().getService();
+    Page<Blob> blobs =
+        storage.list(
+            OUTPUT_BUCKET_NAME,
+            Storage.BlobListOption.currentDirectory(),
+            Storage.BlobListOption.prefix(OUTPUT_PREFIX));
+
+    deleteDirectory(storage, blobs);
+  }
+
+  /**
+   * Deletes each listed blob. When a delete reports {@code false}, the blob's name is used as a
+   * prefix and whatever is listed under it is deleted recursively (presumably because the entry
+   * is a directory-style placeholder rather than a real object).
+   */
+  private static void deleteDirectory(Storage storage, Page<Blob> blobs) {
+    for (Blob blob : blobs.iterateAll()) {
+      if (!blob.delete()) {
+        Page<Blob> subBlobs =
+            storage.list(
+                OUTPUT_BUCKET_NAME,
+                Storage.BlobListOption.currentDirectory(),
+                Storage.BlobListOption.prefix(blob.getName()));
+
+        deleteDirectory(storage, subBlobs);
+      }
+    }
+  }
+
+  @Before
+  public void checkRequirements() {
+    requireEnvVar("GOOGLE_CLOUD_PROJECT");
+    requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS");
+  }
+
+  @Before
+  public void setUp() {
+    // Redirect System.out into a buffer so the test can inspect what the sample prints.
+    bout = new ByteArrayOutputStream();
+    out = new PrintStream(bout);
+    originalPrintStream = System.out;
+    System.setOut(out);
+
+    Storage storage = StorageOptions.getDefaultInstance().getService();
+    storage.create(BucketInfo.of(BUCKET_NAME));
+  }
+
+  @Test
+  public void testBatchProcessDocument()
+      throws InterruptedException, ExecutionException, TimeoutException, IOException {
+    // Run the batch sample on the invoice stored in Cloud Storage.
+    BatchProcessDocument.batchProcessDocument(
+        PROJECT_ID, "us", PROCESSOR_ID, INPUT_URI, OUTPUT_BUCKET_NAME, OUTPUT_PREFIX);
+    String got = bout.toString();
+
+    assertThat(got).contains("Paragraph text:");
+    assertThat(got).contains("Extracted");
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      cleanUpBucket();
+      // Remove the scratch bucket so repeated runs do not accumulate buckets in the project.
+      StorageOptions.getDefaultInstance().getService().delete(BUCKET_NAME);
+    } finally {
+      // Restore System.out even when cleanup throws, so later tests are not left redirected.
+      System.out.flush();
+      System.setOut(originalPrintStream);
+    }
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java b/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java
new file mode 100644
index 00000000000..6a4a35aa9eb
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the {@code ProcessDocument} sample on a local invoice PDF and checks what it prints. */
+public class ProcessDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "88541adc6eeec481";
+  private static final String FILE_PATH = "resources/invoice.pdf";
+
+  // Receives everything printed while System.out is redirected.
+  private ByteArrayOutputStream capturedBytes;
+  private PrintStream capturingStream;
+  // The stream System.out pointed to before setUp ran; put back in tearDown.
+  private PrintStream savedSystemOut;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    String failureMessage =
+        String.format("Environment variable '%s' must be set to perform these tests.", varName);
+    assertNotNull(failureMessage, System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    for (String requiredVar :
+        new String[] {"GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS"}) {
+      requireEnvVar(requiredVar);
+    }
+  }
+
+  @Before
+  public void setUp() {
+    savedSystemOut = System.out;
+    capturedBytes = new ByteArrayOutputStream();
+    capturingStream = new PrintStream(capturedBytes);
+    System.setOut(capturingStream);
+  }
+
+  @Test
+  public void testProcessDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Run the sample on the local invoice PDF; its output ends up in capturedBytes.
+    ProcessDocument.processDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String printed = capturedBytes.toString();
+
+    for (String expected : new String[] {"Paragraph text:", "Extracted"}) {
+      assertThat(printed).contains(expected);
+    }
+  }
+
+  @After
+  public void tearDown() {
+    // Flush whatever is still buffered in the capturing stream, then undo the redirection.
+    System.out.flush();
+    System.setOut(savedSystemOut);
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1/QuickStartTest.java b/document-ai/src/test/java/documentai/v1/QuickStartTest.java
new file mode 100644
index 00000000000..afaa4b1c7f1
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1/QuickStartTest.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the {@code QuickStart} sample on a local invoice PDF and checks what it prints. */
+public class QuickStartTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "88541adc6eeec481";
+  private static final String FILE_PATH = "resources/invoice.pdf";
+
+  // Receives everything printed while System.out is redirected.
+  private ByteArrayOutputStream capturedBytes;
+  private PrintStream capturingStream;
+  // The stream System.out pointed to before setUp ran; put back in tearDown.
+  private PrintStream savedSystemOut;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    String failureMessage =
+        String.format("Environment variable '%s' must be set to perform these tests.", varName);
+    assertNotNull(failureMessage, System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    for (String requiredVar :
+        new String[] {"GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS"}) {
+      requireEnvVar(requiredVar);
+    }
+  }
+
+  @Before
+  public void setUp() {
+    savedSystemOut = System.out;
+    capturedBytes = new ByteArrayOutputStream();
+    capturingStream = new PrintStream(capturedBytes);
+    System.setOut(capturingStream);
+  }
+
+  @Test
+  public void testQuickStart()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Run the quick start on the local invoice PDF; its output ends up in capturedBytes.
+    QuickStart.quickStart(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String printed = capturedBytes.toString();
+
+    assertThat(printed).contains("Paragraph text:");
+  }
+
+  @After
+  public void tearDown() {
+    // Flush whatever is still buffered in the capturing stream, then undo the redirection.
+    System.out.flush();
+    System.setOut(savedSystemOut);
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java
new file mode 100644
index 00000000000..7491d7442e7
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the {@code ProcessFormDocument} sample on a local invoice PDF and checks its output. */
+public class ProcessFormDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "88541adc6eeec481";
+  private static final String FILE_PATH = "resources/invoice.pdf";
+
+  // Receives everything printed while System.out is redirected.
+  private ByteArrayOutputStream capturedBytes;
+  private PrintStream capturingStream;
+  // The stream System.out pointed to before setUp ran; put back in tearDown.
+  private PrintStream savedSystemOut;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    String failureMessage =
+        String.format("Environment variable '%s' must be set to perform these tests.", varName);
+    assertNotNull(failureMessage, System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    for (String requiredVar :
+        new String[] {"GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS"}) {
+      requireEnvVar(requiredVar);
+    }
+  }
+
+  @Before
+  public void setUp() {
+    savedSystemOut = System.out;
+    capturedBytes = new ByteArrayOutputStream();
+    capturingStream = new PrintStream(capturedBytes);
+    System.setOut(capturingStream);
+  }
+
+  @Test
+  public void testProcessFormDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Run the form sample on the local invoice PDF; its output ends up in capturedBytes.
+    ProcessFormDocument.processFormDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String printed = capturedBytes.toString();
+
+    // Checked in this order; the first missing fragment fails the test.
+    String[] expectedFragments = {
+      "There are 1 page(s) in this document.",
+      "Table with 4 columns and 6 rows",
+      "Found 13 form fields",
+      "'BALANCE DUE': '$2140.00'"
+    };
+    for (String expected : expectedFragments) {
+      assertThat(printed).contains(expected);
+    }
+  }
+
+  @After
+  public void tearDown() {
+    // Flush whatever is still buffered in the capturing stream, then undo the redirection.
+    System.out.flush();
+    System.setOut(savedSystemOut);
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java
new file mode 100644
index 00000000000..0c2da47156b
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the {@code ProcessOcrDocument} sample on a local handwritten form and checks its output. */
+public class ProcessOcrDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "f9018d35bc5edc1e";
+  private static final String FILE_PATH = "resources/handwritten_form.pdf";
+
+  // Receives everything printed while System.out is redirected.
+  private ByteArrayOutputStream capturedBytes;
+  private PrintStream capturingStream;
+  // The stream System.out pointed to before setUp ran; put back in tearDown.
+  private PrintStream savedSystemOut;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    String failureMessage =
+        String.format("Environment variable '%s' must be set to perform these tests.", varName);
+    assertNotNull(failureMessage, System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    for (String requiredVar :
+        new String[] {"GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS"}) {
+      requireEnvVar(requiredVar);
+    }
+  }
+
+  @Before
+  public void setUp() {
+    savedSystemOut = System.out;
+    capturedBytes = new ByteArrayOutputStream();
+    capturingStream = new PrintStream(capturedBytes);
+    System.setOut(capturingStream);
+  }
+
+  @Test
+  public void testProcessOcrDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Run the OCR sample on the local handwritten form; its output ends up in capturedBytes.
+    ProcessOcrDocument.processOcrDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String printed = capturedBytes.toString();
+
+    // NOTE(review): "en" is a very short needle; it looks like it would also match inside
+    // ordinary words in the output (e.g. "tokens"), so this check may not prove much.
+    for (String expected : new String[] {"Page 1", "en", "FakeDoc"}) {
+      assertThat(printed).contains(expected);
+    }
+  }
+
+  @After
+  public void tearDown() {
+    // Flush whatever is still buffered in the capturing stream, then undo the redirection.
+    System.out.flush();
+    System.setOut(savedSystemOut);
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java
new file mode 100644
index 00000000000..7379dbf0f30
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the {@code ProcessQualityDocument} sample on a local PDF and checks what it prints. */
+public class ProcessQualityDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "f80f55e03d4c20ed";
+  private static final String FILE_PATH = "resources/document_quality_poor.pdf";
+
+  // Receives everything printed while System.out is redirected.
+  private ByteArrayOutputStream capturedBytes;
+  private PrintStream capturingStream;
+  // The stream System.out pointed to before setUp ran; put back in tearDown.
+  private PrintStream savedSystemOut;
+
+  /** Fails with an explanatory message when the named environment variable is not set. */
+  private static void requireEnvVar(String varName) {
+    String failureMessage =
+        String.format("Environment variable '%s' must be set to perform these tests.", varName);
+    assertNotNull(failureMessage, System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    for (String requiredVar :
+        new String[] {"GOOGLE_CLOUD_PROJECT", "GOOGLE_APPLICATION_CREDENTIALS"}) {
+      requireEnvVar(requiredVar);
+    }
+  }
+
+  @Before
+  public void setUp() {
+    savedSystemOut = System.out;
+    capturedBytes = new ByteArrayOutputStream();
+    capturingStream = new PrintStream(capturedBytes);
+    System.setOut(capturingStream);
+  }
+
+  @Test
+  public void testProcessQualityDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Run the quality sample on the local PDF; its output ends up in capturedBytes.
+    ProcessQualityDocument.processQualityDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String printed = capturedBytes.toString();
+
+    // Checked in this order; the first missing fragment fails the test.
+    String[] expectedFragments = {
+      "Page 1 has a quality score of", "defect_blurry score of 9", "defect_noisy"
+    };
+    for (String expected : expectedFragments) {
+      assertThat(printed).contains(expected);
+    }
+  }
+
+  @After
+  public void tearDown() {
+    // Flush whatever is still buffered in the capturing stream, then undo the redirection.
+    System.out.flush();
+    System.setOut(savedSystemOut);
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java
new file mode 100644
index 00000000000..5f5b21d078d
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the ProcessSpecializedDocument sample and asserts on the text it prints to System.out. */
+public class ProcessSpecializedDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "ae8bc99f01b36b5e";
+  private static final String FILE_PATH = "resources/us_driver_license.pdf";
+
+  private final ByteArrayOutputStream bout = new ByteArrayOutputStream();
+  private PrintStream originalPrintStream; // null until setUp() has swapped System.out
+
+  private static void requireEnvVar(String varName) {
+    assertNotNull(
+        String.format("Environment variable '%s' must be set to perform these tests.", varName),
+        System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    requireEnvVar("GOOGLE_CLOUD_PROJECT");
+    requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS");
+  }
+
+  @Before
+  public void setUp() {
+    originalPrintStream = System.out;
+    System.setOut(new PrintStream(bout));
+  }
+
+  @Test
+  public void testProcessSpecializedDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Process the driver license PDF at FILE_PATH, then check the captured console output.
+    ProcessSpecializedDocument.processSpecializedDocument(
+        PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String got = bout.toString();
+    assertThat(got).contains("Document Id");
+    assertThat(got).contains("97551579");
+  }
+
+  @After
+  public void tearDown() {
+    // @After runs even if a @Before failed, so setUp() may never have captured System.out.
+    if (originalPrintStream != null) {
+      System.out.flush();
+      System.setOut(originalPrintStream);
+    }
+  }
+}
diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java
new file mode 100644
index 00000000000..8fcf7aafb6f
--- /dev/null
+++ b/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Runs the ProcessSplitterDocument sample and asserts on the text it prints to System.out. */
+public class ProcessSplitterDocumentTest {
+  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
+  private static final String PROCESSOR_ID = "7cb010d65184a4d";
+  private static final String FILE_PATH = "resources/multi_document.pdf";
+
+  private final ByteArrayOutputStream bout = new ByteArrayOutputStream();
+  private PrintStream originalPrintStream; // null until setUp() has swapped System.out
+
+  private static void requireEnvVar(String varName) {
+    assertNotNull(
+        String.format("Environment variable '%s' must be set to perform these tests.", varName),
+        System.getenv(varName));
+  }
+
+  @Before
+  public void checkRequirements() {
+    requireEnvVar("GOOGLE_CLOUD_PROJECT");
+    requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS");
+  }
+
+  @Before
+  public void setUp() {
+    originalPrintStream = System.out;
+    System.setOut(new PrintStream(bout));
+  }
+
+  @Test
+  public void testProcessSplitterDocument()
+      throws InterruptedException, ExecutionException, IOException, TimeoutException {
+    // Process the multi-document PDF at FILE_PATH, then check the captured console output.
+    ProcessSplitterDocument.processSplitterDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH);
+    String got = bout.toString();
+    assertThat(got).contains("Found 8 subdocuments");
+    assertThat(got).contains("confident that pages 1 to 2 are a subdocument");
+    assertThat(got).contains("confident that page 10 is a subdocument");
+  }
+
+  @After
+  public void tearDown() {
+    // @After runs even if a @Before failed, so setUp() may never have captured System.out.
+    if (originalPrintStream != null) {
+      System.out.flush();
+      System.setOut(originalPrintStream);
+    }
+  }
+}