diff --git a/document-ai/pom.xml b/document-ai/pom.xml new file mode 100644 index 00000000000..3cd45cc7ac9 --- /dev/null +++ b/document-ai/pom.xml @@ -0,0 +1,64 @@ + + + 4.0.0 + com.example.documentai + documentai-snippets + jar + Google Document AI Snippets + https://github.com/GoogleCloudPlatform/java-docs-samples/tree/main/document-ai + + + + com.google.cloud.samples + shared-configuration + 1.2.0 + + + + 1.8 + 1.8 + UTF-8 + + + + + + + + com.google.cloud + libraries-bom + 26.1.3 + pom + import + + + + + + + com.google.cloud + google-cloud-document-ai + 2.7.5 + + + + com.google.cloud + google-cloud-storage + + + junit + junit + 4.13.2 + test + + + com.google.truth + truth + 1.1.3 + test + + + diff --git a/document-ai/resources/document_quality_poor.pdf b/document-ai/resources/document_quality_poor.pdf new file mode 100644 index 00000000000..3a34a925c04 Binary files /dev/null and b/document-ai/resources/document_quality_poor.pdf differ diff --git a/document-ai/resources/handwritten_form.pdf b/document-ai/resources/handwritten_form.pdf new file mode 100644 index 00000000000..2189ffffd00 Binary files /dev/null and b/document-ai/resources/handwritten_form.pdf differ diff --git a/document-ai/resources/invoice.pdf b/document-ai/resources/invoice.pdf new file mode 100644 index 00000000000..7722734a430 Binary files /dev/null and b/document-ai/resources/invoice.pdf differ diff --git a/document-ai/resources/multi_document.pdf b/document-ai/resources/multi_document.pdf new file mode 100644 index 00000000000..7ea62eb8f78 Binary files /dev/null and b/document-ai/resources/multi_document.pdf differ diff --git a/document-ai/resources/us_driver_license.pdf b/document-ai/resources/us_driver_license.pdf new file mode 100644 index 00000000000..f8f62d902ee Binary files /dev/null and b/document-ai/resources/us_driver_license.pdf differ diff --git a/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java b/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java 
new file mode 100644 index 00000000000..efee05e61ec --- /dev/null +++ b/document-ai/src/main/java/documentai/v1/BatchProcessDocument.java @@ -0,0 +1,178 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1; + +// [START documentai_batch_process_document] + +import com.google.api.gax.longrunning.OperationFuture; +import com.google.api.gax.paging.Page; +import com.google.cloud.documentai.v1.BatchDocumentsInputConfig; +import com.google.cloud.documentai.v1.BatchProcessMetadata; +import com.google.cloud.documentai.v1.BatchProcessRequest; +import com.google.cloud.documentai.v1.BatchProcessResponse; +import com.google.cloud.documentai.v1.Document; +import com.google.cloud.documentai.v1.DocumentOutputConfig; +import com.google.cloud.documentai.v1.DocumentOutputConfig.GcsOutputConfig; +import com.google.cloud.documentai.v1.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1.GcsDocument; +import com.google.cloud.documentai.v1.GcsDocuments; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import com.google.protobuf.util.JsonFormat; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import 
java.util.concurrent.TimeoutException; + +public class BatchProcessDocument { + public static void batchProcessDocument() + throws IOException, InterruptedException, TimeoutException, ExecutionException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String outputGcsBucketName = "your-gcs-bucket-name"; + String outputGcsPrefix = "PREFIX"; + String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.pdf"; + batchProcessDocument( + projectId, location, processerId, inputGcsUri, outputGcsBucketName, outputGcsPrefix); + } + + public static void batchProcessDocument( + String projectId, + String location, + String processorId, + String gcsInputUri, + String gcsOutputBucketName, + String gcsOutputUriPrefix) + throws IOException, InterruptedException, TimeoutException, ExecutionException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. 
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + GcsDocument gcsDocument = + GcsDocument.newBuilder().setGcsUri(gcsInputUri).setMimeType("application/pdf").build(); + + GcsDocuments gcsDocuments = GcsDocuments.newBuilder().addDocuments(gcsDocument).build(); + + BatchDocumentsInputConfig inputConfig = + BatchDocumentsInputConfig.newBuilder().setGcsDocuments(gcsDocuments).build(); + + String fullGcsPath = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix); + GcsOutputConfig gcsOutputConfig = GcsOutputConfig.newBuilder().setGcsUri(fullGcsPath).build(); + + DocumentOutputConfig documentOutputConfig = + DocumentOutputConfig.newBuilder().setGcsOutputConfig(gcsOutputConfig).build(); + + // Configure the batch process request. + BatchProcessRequest request = + BatchProcessRequest.newBuilder() + .setName(name) + .setInputDocuments(inputConfig) + .setDocumentOutputConfig(documentOutputConfig) + .build(); + + OperationFuture future = + client.batchProcessDocumentsAsync(request); + + // Batch process document using a long-running operation. + // You can wait for now, or get results later. + // Note: first request to the service takes longer than subsequent + // requests. + System.out.println("Waiting for operation to complete..."); + future.get(240, TimeUnit.SECONDS); + + System.out.println("Document processing complete."); + + Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService(); + Bucket bucket = storage.get(gcsOutputBucketName); + + // List all of the files in the Storage bucket. 
+ Page blobs = bucket.list(Storage.BlobListOption.prefix(gcsOutputUriPrefix + "/")); + int idx = 0; + for (Blob blob : blobs.iterateAll()) { + if (!blob.isDirectory()) { + System.out.printf("Fetched file #%d\n", ++idx); + // Read the results + + // Download and store json data in a temp file. + File tempFile = File.createTempFile("file", ".json"); + Blob fileInfo = storage.get(BlobId.of(gcsOutputBucketName, blob.getName())); + fileInfo.downloadTo(tempFile.toPath()); + + // Parse json file into Document. + FileReader reader = new FileReader(tempFile); + Document.Builder builder = Document.newBuilder(); + JsonFormat.parser().merge(reader, builder); + + Document document = builder.build(); + + // Get all of the document text as one big string. + String text = document.getText(); + + // Read the text recognition output from the processor + System.out.println("The document contains the following paragraphs:"); + Document.Page page1 = document.getPages(0); + List paragraphList = page1.getParagraphsList(); + for (Document.Page.Paragraph paragraph : paragraphList) { + String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text); + System.out.printf("Paragraph text:%s\n", paragraphText); + } + + // Form parsing provides additional output about + // form-formatted PDFs. You must create a form + // processor in the Cloud Console to see full field details. + System.out.println("The following form key/value pairs were detected:"); + + for (Document.Page.FormField field : page1.getFormFieldsList()) { + String fieldName = getText(field.getFieldName().getTextAnchor(), text); + String fieldValue = getText(field.getFieldValue().getTextAnchor(), text); + + System.out.println("Extracted form fields pair:"); + System.out.printf("\t(%s, %s))", fieldName, fieldValue); + } + + // Clean up temp file. 
+ tempFile.deleteOnExit(); + } + } + } + } + + // Extract shards from the text field + private static String getText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } +} +// [END documentai_batch_process_document] diff --git a/document-ai/src/main/java/documentai/v1/ProcessDocument.java b/document-ai/src/main/java/documentai/v1/ProcessDocument.java new file mode 100644 index 00000000000..75a5c639183 --- /dev/null +++ b/document-ai/src/main/java/documentai/v1/ProcessDocument.java @@ -0,0 +1,113 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1; + +// [START documentai_process_document] + +import com.google.cloud.documentai.v1.Document; +import com.google.cloud.documentai.v1.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1.ProcessRequest; +import com.google.cloud.documentai.v1.ProcessResponse; +import com.google.cloud.documentai.v1.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessDocument { + public static void processDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processDocument(projectId, location, processerId, filePath); + } + + public static void processDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. 
+ byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + // Get all of the document text as one big string + String text = documentResponse.getText(); + + // Read the text recognition output from the processor + System.out.println("The document contains the following paragraphs:"); + Document.Page firstPage = documentResponse.getPages(0); + List paragraphs = firstPage.getParagraphsList(); + + for (Document.Page.Paragraph paragraph : paragraphs) { + String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text); + System.out.printf("Paragraph text:\n%s\n", paragraphText); + } + + // Form parsing provides additional output about + // form-formatted PDFs. You must create a form + // processor in the Cloud Console to see full field details. 
+ System.out.println("The following form key/value pairs were detected:"); + + for (Document.Page.FormField field : firstPage.getFormFieldsList()) { + String fieldName = getText(field.getFieldName().getTextAnchor(), text); + String fieldValue = getText(field.getFieldValue().getTextAnchor(), text); + + System.out.println("Extracted form fields pair:"); + System.out.printf("\t(%s, %s))\n", fieldName, fieldValue); + } + } + } + + // Extract shards from the text field + private static String getText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } +} +// [END documentai_process_document] diff --git a/document-ai/src/main/java/documentai/v1/QuickStart.java b/document-ai/src/main/java/documentai/v1/QuickStart.java new file mode 100644 index 00000000000..88f22136a5f --- /dev/null +++ b/document-ai/src/main/java/documentai/v1/QuickStart.java @@ -0,0 +1,99 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1; + +// [START documentai_quickstart] +import com.google.cloud.documentai.v1.Document; +import com.google.cloud.documentai.v1.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1.ProcessRequest; +import com.google.cloud.documentai.v1.ProcessResponse; +import com.google.cloud.documentai.v1.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class QuickStart { + public static void main(String[] args) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processorId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + quickStart(projectId, location, processorId, filePath); + } + + public static void quickStart( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. 
+ byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + // Get all of the document text as one big string + String text = documentResponse.getText(); + + // Read the text recognition output from the processor + System.out.println("The document contains the following paragraphs:"); + Document.Page firstPage = documentResponse.getPages(0); + List paragraphs = firstPage.getParagraphsList(); + + for (Document.Page.Paragraph paragraph : paragraphs) { + String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text); + System.out.printf("Paragraph text:\n%s\n", paragraphText); + } + } + } + + // Extract shards from the text field + private static String getText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } +} +// [END documentai_quickstart] diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java new file mode 100644 index 00000000000..8a50d8533c6 --- /dev/null +++ b/document-ai/src/main/java/documentai/v1beta3/ProcessFormDocument.java @@ -0,0 +1,149 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_form_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessFormDocument { + public static void processFormDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processFormDocument(projectId, location, processerId, filePath); + } + + public static void processFormDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. 
After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the text recognition output from the processor + // For a full list of Document object attributes, + // please reference this page: + // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html + + // Get all of the document text as one big string + String text = documentResponse.getText(); + System.out.printf("Full document text: '%s'\n", removeNewlines(text)); + + // Read the text recognition output from the processor + List pages = documentResponse.getPagesList(); + System.out.printf("There are %s page(s) in this document.\n", pages.size()); + + for (Document.Page page : pages) { + System.out.printf("\n\n**** Page %d ****\n", page.getPageNumber()); + + List tables = page.getTablesList(); + System.out.printf("Found %d table(s):\n", tables.size()); + for 
(Document.Page.Table table : tables) { + printTableInfo(table, text); + } + + List formFields = page.getFormFieldsList(); + System.out.printf("Found %d form fields:\n", formFields.size()); + for (Document.Page.FormField formField : formFields) { + String fieldName = getLayoutText(formField.getFieldName().getTextAnchor(), text); + String fieldValue = getLayoutText(formField.getFieldValue().getTextAnchor(), text); + System.out.printf( + " * '%s': '%s'\n", removeNewlines(fieldName), removeNewlines(fieldValue)); + } + } + } + } + + private static void printTableInfo(Document.Page.Table table, String text) { + Document.Page.Table.TableRow firstBodyRow = table.getBodyRows(0); + int columnCount = firstBodyRow.getCellsCount(); + System.out.printf( + " Table with %d columns and %d rows:\n", columnCount, table.getBodyRowsCount()); + + Document.Page.Table.TableRow headerRow = table.getHeaderRows(0); + StringBuilder headerRowText = new StringBuilder(); + for (Document.Page.Table.TableCell cell : headerRow.getCellsList()) { + String columnName = getLayoutText(cell.getLayout().getTextAnchor(), text); + headerRowText.append(String.format("%s | ", removeNewlines(columnName))); + } + headerRowText.setLength(headerRowText.length() - 3); + System.out.printf(" Collumns: %s\n", headerRowText.toString()); + + StringBuilder firstRowText = new StringBuilder(); + for (Document.Page.Table.TableCell cell : firstBodyRow.getCellsList()) { + String cellText = getLayoutText(cell.getLayout().getTextAnchor(), text); + firstRowText.append(String.format("%s | ", removeNewlines(cellText))); + } + firstRowText.setLength(firstRowText.length() - 3); + System.out.printf(" First row data: %s\n", firstRowText.toString()); + } + + // Extract shards from the text field + private static String getLayoutText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) 
textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } + + private static String removeNewlines(String s) { + return s.replace("\n", "").replace("\r", ""); + } +} +// [END documentai_process_form_document] diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java new file mode 100644 index 00000000000..f483929a13e --- /dev/null +++ b/document-ai/src/main/java/documentai/v1beta3/ProcessOcrDocument.java @@ -0,0 +1,172 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +// [START documentai_process_ocr_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessOcrDocument { + public static void processOcrDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processOcrDocument(projectId, location, processerId, filePath); + } + + public static void processOcrDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. 
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the text recognition output from the processor + // For a full list of Document object attributes, + // please reference this page: + // https://googleapis.dev/java/google-cloud-document-ai/latest/index.html + + // Get all of the document text as one big string + String text = documentResponse.getText(); + System.out.printf("Full document text: '%s'\n", escapeNewlines(text)); + + // Read the text recognition output from the processor + List pages = documentResponse.getPagesList(); + System.out.printf("There are %s page(s) in this document.\n", pages.size()); + + for (Document.Page page : pages) { + System.out.printf("Page %d:\n", page.getPageNumber()); + printPageDimensions(page.getDimension()); + printDetectedLanguages(page.getDetectedLanguagesList()); + printParagraphs(page.getParagraphsList(), text); + printBlocks(page.getBlocksList(), text); + printLines(page.getLinesList(), text); + 
printTokens(page.getTokensList(), text); + } + } + } + + private static void printPageDimensions(Document.Page.Dimension dimension) { + String unit = dimension.getUnit(); + System.out.printf(" Width: %.1f %s\n", dimension.getWidth(), unit); + System.out.printf(" Height: %.1f %s\n", dimension.getHeight(), unit); + } + + private static void printDetectedLanguages( + List detectedLangauges) { + System.out.println(" Detected languages:"); + for (Document.Page.DetectedLanguage detectedLanguage : detectedLangauges) { + String languageCode = detectedLanguage.getLanguageCode(); + float confidence = detectedLanguage.getConfidence(); + System.out.printf(" %s (%.2f%%)\n", languageCode, confidence * 100.0); + } + } + + private static void printParagraphs(List paragraphs, String text) { + System.out.printf(" %d paragraphs detected:\n", paragraphs.size()); + Document.Page.Paragraph firstParagraph = paragraphs.get(0); + String firstParagraphText = getLayoutText(firstParagraph.getLayout().getTextAnchor(), text); + System.out.printf(" First paragraph text: %s\n", escapeNewlines(firstParagraphText)); + Document.Page.Paragraph lastParagraph = paragraphs.get(paragraphs.size() - 1); + String lastParagraphText = getLayoutText(lastParagraph.getLayout().getTextAnchor(), text); + System.out.printf(" Last paragraph text: %s\n", escapeNewlines(lastParagraphText)); + } + + private static void printBlocks(List blocks, String text) { + System.out.printf(" %d blocks detected:\n", blocks.size()); + Document.Page.Block firstBlock = blocks.get(0); + String firstBlockText = getLayoutText(firstBlock.getLayout().getTextAnchor(), text); + System.out.printf(" First block text: %s\n", escapeNewlines(firstBlockText)); + Document.Page.Block lastBlock = blocks.get(blocks.size() - 1); + String lastBlockText = getLayoutText(lastBlock.getLayout().getTextAnchor(), text); + System.out.printf(" Last block text: %s\n", escapeNewlines(lastBlockText)); + } + + private static void printLines(List lines, String text) 
{ + System.out.printf(" %d lines detected:\n", lines.size()); + Document.Page.Line firstLine = lines.get(0); + String firstLineText = getLayoutText(firstLine.getLayout().getTextAnchor(), text); + System.out.printf(" First line text: %s\n", escapeNewlines(firstLineText)); + Document.Page.Line lastLine = lines.get(lines.size() - 1); + String lastLineText = getLayoutText(lastLine.getLayout().getTextAnchor(), text); + System.out.printf(" Last line text: %s\n", escapeNewlines(lastLineText)); + } + + private static void printTokens(List tokens, String text) { + System.out.printf(" %d tokens detected:\n", tokens.size()); + Document.Page.Token firstToken = tokens.get(0); + String firstTokenText = getLayoutText(firstToken.getLayout().getTextAnchor(), text); + System.out.printf(" First token text: %s\n", escapeNewlines(firstTokenText)); + Document.Page.Token lastToken = tokens.get(tokens.size() - 1); + String lastTokenText = getLayoutText(lastToken.getLayout().getTextAnchor(), text); + System.out.printf(" Last token text: %s\n", escapeNewlines(lastTokenText)); + } + + // Extract shards from the text field + private static String getLayoutText(Document.TextAnchor textAnchor, String text) { + if (textAnchor.getTextSegmentsList().size() > 0) { + int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex(); + int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex(); + return text.substring(startIdx, endIdx); + } + return "[NO TEXT]"; + } + + private static String escapeNewlines(String s) { + return s.replace("\n", "\\n").replace("\r", "\\r"); + } +} +// [END documentai_process_ocr_document] diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java new file mode 100644 index 00000000000..3e80a574f72 --- /dev/null +++ b/document-ai/src/main/java/documentai/v1beta3/ProcessQualityDocument.java @@ -0,0 +1,98 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1beta3; + +// [START documentai_process_quality_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessQualityDocument { + public static void processQualityDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processQualityDocument(projectId, location, processerId, filePath); + } + + public static void processQualityDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. 
This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. + try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the quality-specific information from the output from the + // Intelligent Document Quality Processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-quality-processor + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. 
+ List entities = documentResponse.getEntitiesList(); + for (Document.Entity entity : entities) { + float entityConfidence = entity.getConfidence(); + long pageNumber = entity.getPageAnchor().getPageRefs(0).getPage() + 1; + System.out.printf( + "Page %d has a quality score of (%.2f%%):\n", pageNumber, entityConfidence * 100.0); + for (Document.Entity property : entity.getPropertiesList()) { + float propertyConfidence = property.getConfidence(); + String propertyType = property.getType(); + System.out.printf(" * %s score of %.2f%%\n", propertyType, propertyConfidence * 100.0); + } + } + } + } +} +// [END documentai_process_quality_document] diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java new file mode 100644 index 00000000000..5cbb1af107c --- /dev/null +++ b/document-ai/src/main/java/documentai/v1beta3/ProcessSpecializedDocument.java @@ -0,0 +1,106 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +// [START documentai_process_specialized_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessSpecializedDocument { + public static void processSpecializedDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processSpecializedDocument(projectId, location, processerId, filePath); + } + + public static void processSpecializedDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. 
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read fields specificly from the specalized US drivers license processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_us-driver-license-parser + // retriving data from other specalized processors follow a similar pattern. + // For a complete list of processors see: + // https://cloud.google.com/document-ai/docs/processors-list + // + // OCR and other data is also present in the quality processor's response. + // Please see the OCR and other samples for how to parse other data in the + // response. + for (Document.Entity entity : documentResponse.getEntitiesList()) { + // Fields detected. For a full list of fields for each processor see + // the processor documentation: + // https://cloud.google.com/document-ai/docs/processors-list + String entityType = entity.getType(); + // some other value formats in addition to text are availible + // e.g. 
dates: `entity.getNormalizedValue().getDateValue().getYear()` + // check for normilized value with `entity.hasNormalizedValue()` + String entityTextValue = escapeNewlines(entity.getTextAnchor().getContent()); + float entityConfidence = entity.getConfidence(); + System.out.printf( + " * %s: %s (%.2f%% confident)\n", + entityType, entityTextValue, entityConfidence * 100.0); + } + } + } + + private static String escapeNewlines(String s) { + return s.replace("\n", "\\n").replace("\r", "\\r"); + } +} +// [END documentai_process_specialized_document] diff --git a/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java b/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java new file mode 100644 index 00000000000..e63e2f8e4cf --- /dev/null +++ b/document-ai/src/main/java/documentai/v1beta3/ProcessSplitterDocument.java @@ -0,0 +1,112 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +// [START documentai_process_splitter_document] + +import com.google.cloud.documentai.v1beta3.Document; +import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient; +import com.google.cloud.documentai.v1beta3.ProcessRequest; +import com.google.cloud.documentai.v1beta3.ProcessResponse; +import com.google.cloud.documentai.v1beta3.RawDocument; +import com.google.protobuf.ByteString; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.List; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; + +public class ProcessSplitterDocument { + public static void processSplitterDocument() + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // TODO(developer): Replace these variables before running the sample. + String projectId = "your-project-id"; + String location = "your-project-location"; // Format is "us" or "eu". + String processerId = "your-processor-id"; + String filePath = "path/to/input/file.pdf"; + processSplitterDocument(projectId, location, processerId, filePath); + } + + public static void processSplitterDocument( + String projectId, String location, String processorId, String filePath) + throws IOException, InterruptedException, ExecutionException, TimeoutException { + // Initialize client that will be used to send requests. This client only needs to be created + // once, and can be reused for multiple requests. After completing all of your requests, call + // the "close" method on the client to safely clean up any remaining background resources. 
+ try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) { + // The full resource name of the processor, e.g.: + // projects/project-id/locations/location/processor/processor-id + // You must create new processors in the Cloud Console first + String name = + String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId); + + // Read the file. + byte[] imageFileData = Files.readAllBytes(Paths.get(filePath)); + + // Convert the image data to a Buffer and base64 encode it. + ByteString content = ByteString.copyFrom(imageFileData); + + RawDocument document = + RawDocument.newBuilder().setContent(content).setMimeType("application/pdf").build(); + + // Configure the process request. + ProcessRequest request = + ProcessRequest.newBuilder().setName(name).setRawDocument(document).build(); + + // Recognizes text entities in the PDF document + ProcessResponse result = client.processDocument(request); + Document documentResponse = result.getDocument(); + + System.out.println("Document processing complete."); + + // Read the splitter output from the document splitter processor: + // https://cloud.google.com/document-ai/docs/processors-list#processor_doc-splitter + // This processor only provides text for the document and information on how + // to split the document on logical boundaries. To identify and extract text, + // form elements, and entities please see other processors like the OCR, form, + // and specalized processors. 
+ List entities = documentResponse.getEntitiesList(); + System.out.printf("Found %d subdocuments:\n", entities.size()); + for (Document.Entity entity : entities) { + float entityConfidence = entity.getConfidence(); + String pagesRangeText = pageRefsToString(entity.getPageAnchor().getPageRefsList()); + String subdocumentType = entity.getType(); + if (subdocumentType.isEmpty()) { + System.out.printf( + "%.2f%% confident that %s a subdocument.\n", entityConfidence * 100, pagesRangeText); + } else { + System.out.printf( + "%.2f%% confident that %s a '%s' subdocument.\n", + entityConfidence * 100, pagesRangeText, subdocumentType); + } + } + } + } + + // Converts page reference(s) to a string describing the page or page range. + private static String pageRefsToString(List pageRefs) { + if (pageRefs.size() == 1) { + return String.format("page %d is", pageRefs.get(0).getPage() + 1); + } else { + long start = pageRefs.get(0).getPage() + 1; + long end = pageRefs.get(1).getPage() + 1; + return String.format("pages %d to %d are", start, end); + } + } +} +// [END documentai_process_splitter_document] diff --git a/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java b/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java new file mode 100644 index 00000000000..1024ae71fc3 --- /dev/null +++ b/document-ai/src/test/java/documentai/v1/BatchProcessDocumentTest.java @@ -0,0 +1,116 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import com.google.api.gax.paging.Page; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.BucketInfo; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.UUID; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class BatchProcessDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "88541adc6eeec481"; + private static final String BUCKET_NAME = + String.format("document-ai-output-test-%s", UUID.randomUUID()); + private static final String INPUT_URI = "gs://cloud-samples-data/documentai/invoice.pdf"; + private static final String OUTPUT_PREFIX = String.format("%s", UUID.randomUUID()); + private static final String OUTPUT_BUCKET_NAME = PROJECT_ID; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + private static void cleanUpBucket() { + Storage storage = StorageOptions.getDefaultInstance().getService(); + Page blobs = + storage.list( + BUCKET_NAME, + Storage.BlobListOption.currentDirectory(), + Storage.BlobListOption.prefix(OUTPUT_PREFIX)); + + deleteDirectory(storage, blobs); + } + + private static void deleteDirectory(Storage storage, Page blobs) { + for (Blob blob : 
blobs.iterateAll()) { + if (!blob.delete()) { + Page subBlobs = + storage.list( + BUCKET_NAME, + Storage.BlobListOption.currentDirectory(), + Storage.BlobListOption.prefix(blob.getName())); + + deleteDirectory(storage, subBlobs); + } + } + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + + Storage storage = StorageOptions.getDefaultInstance().getService(); + storage.create(BucketInfo.of(BUCKET_NAME)); + } + + @Test + public void testBatchProcessDocument() + throws InterruptedException, ExecutionException, TimeoutException, IOException { + // parse the GCS invoice as a form. + BatchProcessDocument.batchProcessDocument( + PROJECT_ID, "us", PROCESSOR_ID, INPUT_URI, OUTPUT_BUCKET_NAME, OUTPUT_PREFIX); + String got = bout.toString(); + + assertThat(got).contains("Paragraph text:"); + assertThat(got).contains("Extracted"); + } + + @After + public void tearDown() { + cleanUpBucket(); + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java b/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java new file mode 100644 index 00000000000..6a4a35aa9eb --- /dev/null +++ b/document-ai/src/test/java/documentai/v1/ProcessDocumentTest.java @@ -0,0 +1,76 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package documentai.v1; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "88541adc6eeec481"; + private static final String FILE_PATH = "resources/invoice.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. 
+ ProcessDocument.processDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Paragraph text:"); + assertThat(got).contains("Extracted"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1/QuickStartTest.java b/document-ai/src/test/java/documentai/v1/QuickStartTest.java new file mode 100644 index 00000000000..afaa4b1c7f1 --- /dev/null +++ b/document-ai/src/test/java/documentai/v1/QuickStartTest.java @@ -0,0 +1,75 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class QuickStartTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "88541adc6eeec481"; + private static final String FILE_PATH = "resources/invoice.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testQuickStart() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. 
+ QuickStart.quickStart(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Paragraph text:"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java new file mode 100644 index 00000000000..7491d7442e7 --- /dev/null +++ b/document-ai/src/test/java/documentai/v1beta3/ProcessFormDocumentTest.java @@ -0,0 +1,78 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessFormDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "88541adc6eeec481"; + private static final String FILE_PATH = "resources/invoice.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessFormDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. 
+ ProcessFormDocument.processFormDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("There are 1 page(s) in this document."); + assertThat(got).contains("Table with 4 columns and 6 rows"); + assertThat(got).contains("Found 13 form fields"); + assertThat(got).contains("'BALANCE DUE': '$2140.00'"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java new file mode 100644 index 00000000000..0c2da47156b --- /dev/null +++ b/document-ai/src/test/java/documentai/v1beta3/ProcessOcrDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class ProcessOcrDocumentTest { + private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); + private static final String PROCESSOR_ID = "f9018d35bc5edc1e"; + private static final String FILE_PATH = "resources/handwritten_form.pdf"; + + private ByteArrayOutputStream bout; + private PrintStream out; + private PrintStream originalPrintStream; + + private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + + @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + + @Test + public void testProcessOcrDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // parse the GCS invoice as a form. 
+ ProcessOcrDocument.processOcrDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Page 1"); + assertThat(got).contains("en"); + assertThat(got).contains("FakeDoc"); + } + + @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java new file mode 100644 index 00000000000..7379dbf0f30 --- /dev/null +++ b/document-ai/src/test/java/documentai/v1beta3/ProcessQualityDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** Test for ProcessQualityDocument.processQualityDocument: runs the sample on the file at FILE_PATH and checks the text it prints to System.out. */public class ProcessQualityDocumentTest { +/* Project id taken from the GOOGLE_CLOUD_PROJECT environment variable. */ private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); +/* Hard-coded processor id handed to the sample. NOTE(review): presumably a document-quality processor that must already exist in the test project; confirm. */ private static final String PROCESSOR_ID = "f80f55e03d4c20ed"; +/* Relative path of the input PDF handed to the sample. */ private static final String FILE_PATH = "resources/document_quality_poor.pdf"; + +/* In-memory sink that receives everything printed to System.out during a test. */ private ByteArrayOutputStream bout; + private PrintStream out; +/* The System.out that was active before setUp(); tearDown() puts it back. */ private PrintStream originalPrintStream; + +/** Fails with a descriptive message when the environment variable varName is not set. */ private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + +/** Requires GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS to be set before each test. */ @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + +/** Redirects System.out into bout, remembering the previous stream so tearDown() can restore it. */ @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + +/** Runs the sample once and asserts that the captured output contains the expected fragments. */ @Test + public void testProcessQualityDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // Run the document-quality sample on the local file at FILE_PATH; its output is captured in bout. 
+ ProcessQualityDocument.processQualityDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Page 1 has a quality score of"); +/* NOTE(review): this fragment pins the first digit of a printed score; presumably the value comes from the processor and could shift between versions, confirm it is stable. */ assertThat(got).contains("defect_blurry score of 9"); + assertThat(got).contains("defect_noisy"); + } + +/** Flushes the redirected stream and restores the original System.out. */ @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java new file mode 100644 index 00000000000..5f5b21d078d --- /dev/null +++ b/document-ai/src/test/java/documentai/v1beta3/ProcessSpecializedDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** Test for ProcessSpecializedDocument.processSpecializedDocument: runs the sample on the file at FILE_PATH and checks the text it prints to System.out. */public class ProcessSpecializedDocumentTest { +/* Project id taken from the GOOGLE_CLOUD_PROJECT environment variable. */ private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); +/* Hard-coded processor id handed to the sample. NOTE(review): presumably a driver-license processor that must already exist in the test project; confirm. */ private static final String PROCESSOR_ID = "ae8bc99f01b36b5e"; +/* Relative path of the input PDF handed to the sample. */ private static final String FILE_PATH = "resources/us_driver_license.pdf"; + +/* In-memory sink that receives everything printed to System.out during a test. */ private ByteArrayOutputStream bout; + private PrintStream out; +/* The System.out that was active before setUp(); tearDown() puts it back. */ private PrintStream originalPrintStream; + +/** Fails with a descriptive message when the environment variable varName is not set. */ private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + +/** Requires GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS to be set before each test. */ @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + +/** Redirects System.out into bout, remembering the previous stream so tearDown() can restore it. */ @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + +/** Runs the sample once and asserts that the captured output contains the expected fragments. */ @Test + public void testProcessSpecializedDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // Run the specialized-processor sample on the local file at FILE_PATH; its output is captured in bout. 
+ ProcessSpecializedDocument.processSpecializedDocument( + PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Document Id"); +/* NOTE(review): presumably the document id value extracted from the sample license; confirm against the fixture. */ assertThat(got).contains("97551579"); + } + +/** Flushes the redirected stream and restores the original System.out. */ @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +} diff --git a/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java b/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java new file mode 100644 index 00000000000..8fcf7aafb6f --- /dev/null +++ b/document-ai/src/test/java/documentai/v1beta3/ProcessSplitterDocumentTest.java @@ -0,0 +1,77 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package documentai.v1beta3; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** Test for ProcessSplitterDocument.processSplitterDocument: runs the sample on the file at FILE_PATH and checks the text it prints to System.out. */public class ProcessSplitterDocumentTest { +/* Project id taken from the GOOGLE_CLOUD_PROJECT environment variable. */ private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT"); +/* Hard-coded processor id handed to the sample. NOTE(review): this id is 15 characters long; confirm it is complete and that the processor exists in the test project. */ private static final String PROCESSOR_ID = "7cb010d65184a4d"; +/* Relative path of the input PDF handed to the sample. */ private static final String FILE_PATH = "resources/multi_document.pdf"; + +/* In-memory sink that receives everything printed to System.out during a test. */ private ByteArrayOutputStream bout; + private PrintStream out; +/* The System.out that was active before setUp(); tearDown() puts it back. */ private PrintStream originalPrintStream; + +/** Fails with a descriptive message when the environment variable varName is not set. */ private static void requireEnvVar(String varName) { + assertNotNull( + String.format("Environment variable '%s' must be set to perform these tests.", varName), + System.getenv(varName)); + } + +/** Requires GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS to be set before each test. */ @Before + public void checkRequirements() { + requireEnvVar("GOOGLE_CLOUD_PROJECT"); + requireEnvVar("GOOGLE_APPLICATION_CREDENTIALS"); + } + +/** Redirects System.out into bout, remembering the previous stream so tearDown() can restore it. */ @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + originalPrintStream = System.out; + System.setOut(out); + } + +/** Runs the sample once and asserts that the captured output contains the expected fragments. */ @Test + public void testProcessSplitterDocument() + throws InterruptedException, ExecutionException, IOException, TimeoutException { + // Run the splitter sample on the local file at FILE_PATH; its output is captured in bout. + ProcessSplitterDocument.processSplitterDocument(PROJECT_ID, "us", PROCESSOR_ID, FILE_PATH); + String got = bout.toString(); + + assertThat(got).contains("Found 8 subdocuments"); + assertThat(got).contains("confident that pages 1 to 2 are a subdocument"); + assertThat(got).contains("confident that page 10 is a subdocument"); + } + +/** Flushes the redirected stream and restores the original System.out. */ @After + public void tearDown() { + System.out.flush(); + System.setOut(originalPrintStream); + } +}