Skip to content

Commit 9cc07cc

Browse files
authored
fix: Add support for Classifier entities (#333)
- Added support for Custom Classifier entities
- Added Unit Tests for Classifier output
- Added input validation for `Document.split_pdf()`

Fixes #332 🦕
1 parent 98e7bc1 commit 9cc07cc

File tree

5 files changed

+84
-18
lines changed

5 files changed

+84
-18
lines changed

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ def entities_to_bigquery(
765765
)
766766

767767
def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
768-
r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
768+
r"""Splits local PDF file into multiple PDF files based on output from a Splitter processor.
769769
770770
Args:
771771
pdf_path (str):
@@ -776,6 +776,8 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
776776
List[str]:
777777
A list of output pdf files.
778778
"""
779+
# Validate every entity, not just the first: any entity without page
# references cannot be mapped to a page range. Iterating with any() also
# avoids the IndexError that `self.entities[0]` raises when the document
# has no entities; an empty list simply falls through to the code below.
if any(
    entity.start_page is None or entity.end_page is None
    for entity in self.entities
):
    raise ValueError("Entities do not contain start or end pages.")
779781
output_files: List[str] = []
780782
input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
781783
with Pdf.open(pdf_path) as pdf:

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,45 +40,46 @@ class Entity:
4040
Required. Entity type from a schema e.g. "Address".
4141
mention_text (str):
4242
Optional. Text value in the document e.g. "1600 Amphitheatre Pkwy".
43-
If the entity is not present in
44-
the document, this field will be empty.
43+
Only populated for Extraction processors.
4544
normalized_text (str):
4645
Optional. Normalized text value in the document e.g. "1970-01-01".
47-
If the entity is not present in
48-
the document, this field will be empty.
46+
Only populated for Extraction processors.
4947
start_page (int):
50-
Required. `Page` containing the `Entity` or the first page of the
51-
classification (for Splitter/Classifier processors).
48+
Optional. `Page` containing the `Entity` for Extraction processors or the first page of the
49+
subdocument for Splitter processors.
5250
end_page (int):
53-
Required. Last page of the classification
51+
Optional. Last page of the subdocument for Splitter processors.
5452
"""
5553

5654
documentai_object: documentai.Document.Entity = dataclasses.field(repr=False)
5755
page_offset: dataclasses.InitVar[Optional[int]] = 0
5856

5957
type_: str = dataclasses.field(init=False)
60-
mention_text: str = dataclasses.field(init=False, default="")
61-
normalized_text: str = dataclasses.field(init=False, default="")
58+
mention_text: Optional[str] = dataclasses.field(init=False, default=None)
59+
normalized_text: Optional[str] = dataclasses.field(init=False, default=None)
6260

63-
start_page: int = dataclasses.field(init=False)
64-
# Only Populated for Splitter/Classifier Output
65-
end_page: int = dataclasses.field(init=False)
61+
start_page: Optional[int] = dataclasses.field(init=False, default=None)
62+
end_page: Optional[int] = dataclasses.field(init=False, default=None)
6663

6764
_image: Optional[Image.Image] = dataclasses.field(init=False, default=None)
6865

6966
def __post_init__(self, page_offset: int) -> None:
7067
self.type_ = self.documentai_object.type_
71-
self.mention_text = self.documentai_object.mention_text
68+
69+
if self.documentai_object.mention_text:
70+
self.mention_text = self.documentai_object.mention_text
71+
7272
if (
7373
self.documentai_object.normalized_value
7474
and self.documentai_object.normalized_value.text
7575
):
7676
self.normalized_text = self.documentai_object.normalized_value.text
7777

78-
page_refs = self.documentai_object.page_anchor.page_refs
79-
if page_refs:
80-
self.start_page = int(page_refs[0].page) + page_offset
81-
self.end_page = int(page_refs[-1].page) + page_offset
78+
if self.documentai_object.page_anchor:
79+
page_refs = self.documentai_object.page_anchor.page_refs
80+
if page_refs:
81+
self.start_page = int(page_refs[0].page) + page_offset
82+
self.end_page = int(page_refs[-1].page) + page_offset
8283

8384
def crop_image(
8485
self, documentai_page: documentai.Document.Page
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"text": "US010182182B2\n(12) United States Patent\nLewkow et al.\n(10) Patent No.: US 10,182,182 B2\n(45) Date of Patent: Jan. 15, 2019\n(54)\nIMAGE SENSOR HAVING MULTIPLE\nOUTPUT PORTS\n(71)\nApplicant: Google LLC, Mountain View, CA (US)\nH04N 7/0127 (2013.01); H04N_7/0806\n(2013.01); H04N 13/239 (2018.05); H04N\n13/254 (2018.05); H04N 13/271 (2018.05)\n(58) Field of Classification Search\nCPC G01S 17/08; H04N 5/2258; H04N 5/23229;\nH04N 13/0271; H04N 5/376; H04N\n5/3765; H04N 5/378; H04N 5/345\nSee application file for complete search history.\n(72)\nInventors: Roman Lewkow, San Jose, CA (US);\nChung Chun Wan, San Jose, CA (US)\n(73)\nAssignee: Google LLC, Mountain View, CA (US)\n(*)\nNotice:\n(56)\nReferences Cited\nSubject to any disclaimer, the term of this\npatent is extended or adjusted under 35\nU.S.C. 154(b) by 0 days.\nU.S. PATENT DOCUMENTS\n(21)\nAppl. No.: 15/831,925\n6,831,688 B2 * 12/2004 Lareau\nGO1J 3/02\n348/272\n(22)\nFiled:\nDec. 5, 2017\n7,247,393 B2\n7,936,038 B2\n7,990,636 B2\n8,027,107 B2\n7/2007 Hazel et al.\n5/2011 Jeong et al.\n8/2011 Park et al.\n9/2011 Hwang et al.\n(Continued)\n(65)\nPrior Publication Data\nFOREIGN PATENT DOCUMENTS\n(63)\nUS 2018/0097979 A1 Apr. 5, 2018\nRelated U.S. Application Data\nContinuation of application No. 15/476,165, filed on\nMar. 31, 2017, now Pat. No. 9,866,740, which is a\ncontinuation of application No. 14/580,025, filed on\nDec. 22, 2014, now Pat. No. 9,615,013.\nEP\n1478176\n11/2004\nOTHER PUBLICATIONS\n(51)\nPCT/US2015/062157-International Search Report & Written Opin-\nion, dated Mar. 8, 2016, 12 pages.\n(Continued)\nPrimary Examiner Nicholas G Giles\n(74) Attorney, Agent, or Firm - Fish & Richardson P.C.\nInt. Cl.\nH04N 5/225\n(2006.01)\nH04N 5/374\n(2011.01)\nH04N 5/378\n(2011.01)\nH04N 5/369\n(2011.01)\nH04N 5/232\n(2006.01)\nH04N 13/254 (2018.01)\nH04N 13/271\n(2018.01)\nH04N 7/01\n(2006.01)\nH04N 7/08\n(2006.01)\nH04N 13/239 (2018.01)\nU.S. 
Cl.\nCPC H04N 5/2258 (2013.01); H04N 5/23245\n(2013.01); H04N 5/3696 (2013.01); H04N\n5/378 (2013.01); H04N 5/3742 (2013.01);\n(57)\nABSTRACT\nAn apparatus is described that includes an image sensor\nhaving a first output port and a second output port. The first\noutput port is to transmit a first image stream concurrently\nwith a second image stream transmitted from the second\noutput port.\n(52)\n18 Claims, 10 Drawing Sheets\nImage Sensor\n410b\nFirst Image\nStream 401b\nImage Signal\nProcessing\nPipeline 407_1b\n1\n1\n2\n413_1b\n1\n2\n3\n5\n6\nSecond Image\nStream 402b\n413_2b\nImage Signal Processing\nPipeline 407_2b\ntime\n", "pages": [{"pageNumber": 1}], "entities": [{"type": "computer_vision", "confidence": 0.47925246, "id": "0"}, {"type": "crypto", "confidence": 0.0433604, "id": "1"}, {"type": "med_tech", "confidence": 0.26732057, "id": "2"}, {"type": "other", "confidence": 0.2100666, "id": "3"}]}

packages/google-cloud-documentai-toolbox/tests/unit/test_document.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,13 @@ def get_bytes_splitter_mock():
7979
yield byte_factory
8080

8181

82+
@pytest.fixture
def get_bytes_classifier_mock():
    """Patch ``gcs_utilities.get_bytes`` to serve the Classifier test resources.

    Yields:
        The mock that replaces ``gcs_utilities.get_bytes`` so tests can
        assert on how it was called.
    """
    with mock.patch.object(gcs_utilities, "get_bytes") as patched_get_bytes:
        # Load the fixture data with the test module's own get_bytes() helper.
        classifier_payload = get_bytes("tests/unit/resources/classifier")
        patched_get_bytes.return_value = classifier_payload
        yield patched_get_bytes
87+
88+
8289
@pytest.fixture
8390
def get_bytes_images_mock():
8491
with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
@@ -206,6 +213,30 @@ def test_entities_from_shards_with_hex_ids():
206213
assert actual[1].type_ == "class_international"
207214

208215

216+
def test_entities_from_shards_classifier(get_bytes_classifier_mock):
    """Wrap the entities of Classifier shards and verify each wrapped entity."""
    shards = document._get_shards(
        gcs_bucket_name="test-directory",
        gcs_prefix="documentai/output/123456789/0/",
    )
    get_bytes_classifier_mock.assert_called_once()

    actual = document._entities_from_shards(shards=shards)

    # Check for error reported in https://github.com/googleapis/python-documentai-toolbox/issues/332
    assert repr(actual)

    # One row per expected entity, in output order:
    # (type, confidence rounded to 8 decimal places, id).
    expected_entities = [
        ("computer_vision", 0.47925246, "0"),
        ("crypto", 0.0433604, "1"),
        ("med_tech", 0.26732057, "2"),
        ("other", 0.2100666, "3"),
    ]
    for position, row in enumerate(expected_entities):
        expected_type, expected_confidence, expected_id = row
        # Index explicitly (instead of zip) so a short result still fails.
        wrapped = actual[position]
        assert wrapped.type_ == expected_type
        assert round(wrapped.documentai_object.confidence, 8) == expected_confidence
        assert wrapped.documentai_object.id == expected_id
238+
239+
209240
@mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
210241
def test_get_batch_process_metadata_with_valid_operation(
211242
mock_docai,
@@ -703,6 +734,22 @@ def test_split_pdf(mock_Pdf, get_bytes_splitter_mock):
703734
]
704735

705736

737+
def test_split_pdf_with_non_splitter(get_bytes_classifier_mock):
    """split_pdf() raises ValueError for a document built from Classifier output."""
    classifier_document = document.Document.from_gcs(
        gcs_bucket_name="test-directory",
        gcs_prefix="documentai/output/123456789/0",
    )
    split_arguments = {
        "pdf_path": "procurement_multi_document.pdf",
        "output_path": "splitter/output/",
    }
    expected_message = "Entities do not contain start or end pages."

    with pytest.raises(ValueError, match=expected_message):
        classifier_document.split_pdf(**split_arguments)

    get_bytes_classifier_mock.assert_called_once()
751+
752+
706753
def test_convert_document_to_annotate_file_response():
707754
doc = document.Document.from_document_path(
708755
document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"

packages/google-cloud-documentai-toolbox/tests/unit/test_entity.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,21 @@ def test_Entity_splitter():
6868
assert wrapper_entity.end_page == 2
6969

7070

71+
def test_Entity_classifier():
    """Wrap an entity that carries only a type, an id and a confidence.

    The entity is built without mention text, normalized value or page
    anchor, so every optional field of the wrapper must keep its ``None``
    default.
    """
    documentai_entity = documentai.Document.Entity(
        type_="clinical_notes",
        id="0",
        confidence=0.99878639,
    )
    wrapper_entity = entity.Entity(documentai_entity)

    assert wrapper_entity.type_ == "clinical_notes"
    assert wrapper_entity.documentai_object.id == "0"
    assert round(wrapper_entity.documentai_object.confidence, 8) == 0.99878639

    # Compare against None explicitly: a bare `not` check also passes for
    # "" or for 0, and 0 is a value `int(page) + page_offset` can produce,
    # so it would not detect a field that was populated by mistake.
    assert wrapper_entity.mention_text is None
    assert wrapper_entity.normalized_text is None
    assert wrapper_entity.start_page is None
    assert wrapper_entity.end_page is None
84+
85+
7186
def test_Entity_with_page_offset():
7287
documentai_entity = documentai.Document.Entity(
7388
type_="invoice_statement",

0 commit comments

Comments
 (0)