|
| 1 | +# coding: utf-8 |
| 2 | + |
| 3 | +# ------------------------------------------------------------------------- |
| 4 | +# Copyright (c) Microsoft Corporation. All rights reserved. |
| 5 | +# Licensed under the MIT License. See License.txt in the project root for |
| 6 | +# license information. |
| 7 | +# -------------------------------------------------------------------------- |
| 8 | + |
| 9 | +""" |
| 10 | +FILE: sample_get_elements_with_spans_async.py |
| 11 | +
|
| 12 | +DESCRIPTION: |
| 13 | + This sample demonstrates how to get elements that are contained in the spans of another element. |
| 14 | + In this sample, the examples attempt to find the lines and styles that have the same spans as the |
| 15 | + main search element. The purpose of this sample is to show how to search for document elements |
| 16 | + that are within the same span area as other elements. |
| 17 | +
|
| 18 | +USAGE: |
| 19 | + python sample_get_elements_with_spans_async.py |
| 20 | +
|
| 21 | + Set the environment variables with your own values before running the sample: |
| 22 | + 1) AZURE_FORM_RECOGNIZER_ENDPOINT - the endpoint to your Cognitive Services resource. |
| 23 | + 2) AZURE_FORM_RECOGNIZER_KEY - your Form Recognizer API key |
| 24 | +""" |
| 25 | + |
| 26 | +import os |
| 27 | +import asyncio |
| 28 | + |
def get_styles(element_spans, styles):
    """Return the styles that have a span lying inside one of ``element_spans``.

    A style span lies inside an element span when it starts at or after the
    element span's ``offset`` and ends at or before the element span's end
    (``offset + length``).

    :param element_spans: Spans of the element being searched; each item
        exposes ``offset`` and ``length``.
    :param styles: Candidate styles; each item exposes ``spans``, an iterable
        of objects with ``offset`` and ``length``.
    :return: The matching styles, each listed once, in order of first match.
    :rtype: list
    """
    result = []
    # Track matches by identity so a style with several contained spans is
    # reported once, without requiring the style objects to be hashable.
    matched_ids = set()
    for span in element_spans:
        span_end = span.offset + span.length
        for style in styles:
            if id(style) in matched_ids:
                continue
            if any(
                style_span.offset >= span.offset
                and style_span.offset + style_span.length <= span_end
                for style_span in style.spans
            ):
                matched_ids.add(id(style))
                result.append(style)
    return result
| 39 | + |
def get_lines(element_spans, document_page):
    """Return the lines of ``document_page`` lying inside ``element_spans``.

    A line is selected when at least one of its spans starts at or after an
    element span's ``offset`` and ends at or before that element span's end
    (``offset + length``).

    :param element_spans: Spans of the element being searched; each item
        exposes ``offset`` and ``length``.
    :param document_page: Page whose ``lines`` are searched; each line
        exposes ``spans``, an iterable of objects with ``offset`` and ``length``.
    :return: The matching lines, each listed once, in order of first match.
    :rtype: list
    """
    result = []
    # Track matches by identity so a line with several contained spans is
    # reported once, without requiring the line objects to be hashable.
    matched_ids = set()
    for span in element_spans:
        span_end = span.offset + span.length
        for line in document_page.lines:
            if id(line) in matched_ids:
                continue
            if any(
                line_span.offset >= span.offset
                and line_span.offset + line_span.length <= span_end
                for line_span in line.spans
            ):
                matched_ids.add(id(line))
                result.append(line)
    return result
| 50 | + |
def get_page(page_number, pages):
    """Return the first entry of ``pages`` whose ``page_number`` matches.

    :param page_number: Page number to look for.
    :param pages: Iterable of page objects exposing ``page_number``.
    :return: The first matching page.
    :raises ValueError: If no page has the requested number.
    """
    not_found = object()
    candidates = (
        candidate for candidate in pages if candidate.page_number == page_number
    )
    match = next(candidates, not_found)
    if match is not_found:
        raise ValueError("could not find the requested page")
    return match
| 56 | + |
async def get_elements_with_spans_async():
    """Analyze the sample form and print elements located through their spans.

    Sends ``Form_1.jpg`` to the ``prebuilt-document`` model, then prints:

    * for every table, the lines whose spans fall inside the table's spans;
    * for every entity, the handwritten styles whose spans fall inside the
      entity's spans, or a note when there are none.

    The endpoint and key are read from the ``AZURE_FORM_RECOGNIZER_ENDPOINT``
    and ``AZURE_FORM_RECOGNIZER_KEY`` environment variables; a missing
    variable raises ``KeyError``.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer.aio import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    async with document_analysis_client:
        with open(path_to_sample_documents, "rb") as f:
            poller = await document_analysis_client.begin_analyze_document(
                "prebuilt-document", document=f
            )
        result = await poller.result()

    # Below is a method to search for the lines of a particular element by using spans.
    # This example uses DocumentTable, but other elements that also have a `spans` or `span` field
    # can also be used to search for related elements, such as lines in this case.
    # To see an example for searching for words which have a `span` field, see
    # `sample_get_words_on_document_line.py` under the samples v3.2-beta directory.
    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )

        lines = []

        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {}".format(
                    table_idx,
                    region.page_number,
                )
            )
            # Only lines of the page this region sits on are searched.
            lines.extend(
                get_lines(table.spans, get_page(region.page_number, result.pages))
            )

        print("Found # {} lines in the table".format(len(lines)))
        for line in lines:
            print(
                "...Line '{}' is within bounding box: '{}'".format(
                    line.content,
                    line.bounding_box,
                )
            )

    # Below is a method to search for the style of a particular element by using spans.
    # This example uses DocumentEntity, but other elements that also have a `spans` or `span`
    # field can also be used to search for document text style.
    for entity in result.entities:
        styles = get_styles(entity.spans, result.styles)
        print(
            "Found entity '{}' of type '{}' with style:".format(
                entity.content, entity.category,
            )
        )
        # Filter first so the "no handwritten text" note is also printed when
        # styles matched the entity but none of them is handwritten.
        handwritten_styles = [style for style in styles if style.is_handwritten]
        if not handwritten_styles:
            print(
                "...no handwritten text found"
            )
        for style in handwritten_styles:
            print(
                "...handwritten with confidence {}".format(style.confidence)
            )
    print("----------------------------------------")
| 136 | + |
| 137 | + |
async def main():
    """Run the span-lookup sample to completion."""
    sample_run = get_elements_with_spans_async()
    await sample_run
| 140 | + |
| 141 | + |
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. Calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python versions.
    asyncio.run(main())
0 commit comments