Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions docs/source/notebooks/table_data_to_various_formats.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def read_requirements(path):
setup(
# include data files
name="amazon-textract-textractor",
version="1.1.1",
version="1.2.0",
license="Apache 2.0",
description="A package to use AWS Textract services.",
url="https://github.com/aws-samples/amazon-textract-textractor",
Expand Down
Binary file added tests/fixtures/paystub.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 26 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,32 @@ def test_table(self):
self.assertEqual(cell.merge_direction(), (None, "None"))
self.assertEqual(cell.__repr__(), "<Cell: (1,1), Span: (1, 1), Column Header: True, MergedCell: False> Cell 1")

def test_table_with_title_and_footers(self):
    """Check table count, table title and table footers on the test document.

    When the ``CALL_TEXTRACT`` environment variable is set, the paystub image
    under ``fixtures/`` is analyzed with the TABLES feature through a
    ``Textractor`` client; otherwise the document is loaded from the path
    returned by ``get_fixture_path()``.
    """
    # Insert credentials here to run test
    profile_name = "default"
    tests_dir = os.path.abspath(os.path.dirname(__file__))

    if profile_name is None:
        raise InvalidProfileNameError(
            "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
        )

    if not os.environ.get("CALL_TEXTRACT"):
        # Offline path: reuse the stored response instead of calling the service.
        document = Document.open(get_fixture_path())
    else:
        image_path = os.path.join(tests_dir, "fixtures/paystub.jpg")
        client = Textractor(profile_name=profile_name, kms_key_id="")
        document = client.analyze_document(
            file_source=image_path,
            features=[TextractFeatures.TABLES],
            save_image=False,
        )

    tables = document.tables
    self.assertEqual(len(tables), 7)
    self.assertNotEqual(tables[3].title, None)
    # NOTE(review): this passes for any footer count other than 1, including 0
    # -- confirm that is the intended check.
    self.assertNotEqual(len(tables[4].footers), 1)

if __name__ == "__main__":
test = TestTable()
test.setUp()
Expand Down
13 changes: 13 additions & 0 deletions textractor/data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
# Identifiers matched against Textract response fields (presumably the
# ``BlockType`` and ``EntityTypes`` members of a block -- confirm in the parser).
KEY_VALUE_SET = "KEY_VALUE_SET"
SELECTION_ELEMENT = "SELECTION_ELEMENT"
TABLE = "TABLE"
TABLE_TITLE = "TABLE_TITLE"
TABLE_FOOTER = "TABLE_FOOTER"
TABLE_SUMMARY = "TABLE_SUMMARY"
TABLE_SECTION_TITLE = "TABLE_SECTION_TITLE"
TABLE_COLUMN_HEADER = "COLUMN_HEADER"
# Textract spells the table kinds "STRUCTURED_TABLE" / "SEMI_STRUCTURED_TABLE"
# in ``EntityTypes``; the shorter "STRUCTURED" / "SEMI_STRUCTURED" spellings
# never occur in a response, so a membership test against them cannot match.
TABLE_STRUCTURED = "STRUCTURED_TABLE"
TABLE_SEMI_STRUCTURED = "SEMI_STRUCTURED_TABLE"
CELL = "CELL"
PAGE = "PAGE"
MERGED_CELL = "MERGED_CELL"
Expand All @@ -30,6 +37,12 @@
HANDWRITING = "HANDWRITING"
PRINTED = "PRINTED"

class TableTypes(Enum):
    """Table categories recognized by the Textract APIs."""

    # Integer values are part of the enum's contract; keep them stable.
    UNKNOWN = 0
    STRUCTURED = 1
    SEMI_STRUCTURED = 2

class Direction(Enum):
"""Directions available for search using DirectionalFinder"""
Expand Down
67 changes: 66 additions & 1 deletion textractor/entities/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
from textractor.visualizers.entitylist import EntityList
from textractor.entities.document_entity import DocumentEntity
from textractor.entities.selection_element import SelectionElement
from textractor.entities.table_title import TableTitle
from textractor.entities.table_footer import TableFooter
from textractor.utils.geometry_util import get_indices
from textractor.data.constants import SimilarityMetric, TextTypes, CellTypes
from textractor.data.constants import SimilarityMetric, TextTypes, CellTypes, TableTypes
from textractor.data.constants import (
IS_COLUMN_HEAD,
IS_MERGED_CELL,
Expand All @@ -41,6 +43,9 @@ def __init__(self, entity_id, bbox: BoundingBox):
super().__init__(entity_id, bbox)
self.table_cells: List[TableCell] = []
self.column_headers: Dict[str, List[TableCell]] = {}
self._title: TableTitle = None
self._footers: TableFooter = []
self._table_type: TableTypes = TableTypes.UNKNOWN
self._page = None
self._page_id = None

Expand Down Expand Up @@ -80,6 +85,66 @@ def page(self, page_num: int):

self._page = page_num

@property
def table_type(self):
    """
    :return: Returns the table type.
    :rtype: TableTypes
    """

    return self._table_type

@table_type.setter
def table_type(self, table_type: TableTypes):
    """
    Sets the table type attribute of the Table entity.

    :param table_type: Type of the Table entity.
    :type table_type: TableTypes
    """

    self._table_type = table_type

@property
def title(self):
    """
    :return: Returns the title attached to the Table entity (``None`` when no title has been set).
    :rtype: TableTitle
    """

    return self._title

@title.setter
def title(self, title: TableTitle):
    """
    Attaches a title to the Table entity, replacing any previous one.

    :param title: Title of the Table entity.
    :type title: TableTitle
    """

    self._title = title

@property
def footers(self):
    """
    :return: Returns the footers attached to the Table entity.
    :rtype: List[TableFooter]
    """

    return self._footers

@footers.setter
def footers(self, footers: List[TableFooter]):
    """
    Replaces the footers of the Table entity with the given list.

    :param footers: Footers of the Table entity.
    :type footers: List[TableFooter]
    """

    self._footers = footers

@property
def page_id(self) -> str:
"""
Expand Down
33 changes: 31 additions & 2 deletions textractor/entities/table_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,13 @@ class TableCell(DocumentEntity):
:param row_index: Row index of position of cell within the table
:param col_index: Column index of position of cell within the table
:param row_span: How many merged cells does the cell spans horizontally (1 means no merged cells)
:param col_span: How mant merged cells does the cell spand vertically (1 means no merged cells)
:param col_span: How many merged cells does the cell span vertically (1 means no merged cells)
:param confidence: Confidence out of 100 with which the Cell was detected.
:param is_column_header: Indicates if the cell is a column header
:param is_title: Indicates if the cell is a table title
:param is_footer: Indicates if the cell is a table footer
:param is_summary: Indicates if the cell is a summary cell
:param is_section_title: Indicates if the cell is a section title
"""

def __init__(
Expand All @@ -51,7 +56,11 @@ def __init__(
row_span: int,
col_span: int,
confidence: float = 0,
is_column_header: bool = False
is_column_header: bool = False,
is_title: bool = False,
is_footer: bool = False,
is_summary: bool = False,
is_section_title: bool = False
):

super().__init__(entity_id, bbox)
Expand All @@ -64,6 +73,10 @@ def __init__(
self._page = None
self._page_id = None
self._is_column_header = is_column_header
self._is_title = is_title
self._is_footer = is_footer
self._is_summary = is_summary
self._is_section_title = is_section_title
# this gets populated when cells are added to a table using the `add_cells` method
# or when cells are attributed to a table with table.cells = [TableCell]
self._parent_table_id = None
Expand All @@ -74,6 +87,22 @@ def __init__(
def is_column_header(self):
return self._is_column_header

@property
def is_title(self):
    """
    :return: Returns whether the cell is a table title.
    :rtype: bool
    """

    return self._is_title

@property
def is_footer(self):
    """
    :return: Returns whether the cell is a table footer.
    :rtype: bool
    """

    return self._is_footer

@property
def is_summary(self):
    """
    :return: Returns whether the cell is a summary cell.
    :rtype: bool
    """

    return self._is_summary

@property
def is_section_title(self):
    """
    :return: Returns whether the cell is a section title.
    :rtype: bool
    """

    return self._is_section_title

@property
def page(self):
"""
Expand Down
99 changes: 99 additions & 0 deletions textractor/entities/table_footer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
Represents a single :class:`TableFooter` object. The :class:`TableFooter` object contains information such as:

* The position of the footer within the Document
* The words that it contains
* Confidence of entity detection
"""

from typing import List
from textractor.entities.bbox import BoundingBox
from textractor.entities.document_entity import DocumentEntity
from textractor.entities.word import Word
from textractor.visualizers.entitylist import EntityList


class TableFooter(DocumentEntity):
    """
    Represents a footer that is either in-table or floating.

    :param entity_id: Unique identifier of the footer, forwarded to :class:`DocumentEntity`.
    :type entity_id: str
    :param bbox: Bounding box of the footer, forwarded to :class:`DocumentEntity`.
    :type bbox: BoundingBox
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
    ):
        super().__init__(entity_id, bbox)
        self._words: List[Word] = []
        self._is_floating: bool = False
        self._page = None
        self._page_id = None

    @property
    def words(self):
        """
        Returns all the Word objects present in the :class:`TableFooter`.

        :return words: List of Word objects, each representing a word within the TableFooter.
        :rtype: EntityList
        """
        # A new EntityList wrapper is built on every access.
        return EntityList(self._words)

    @words.setter
    def words(self, words: List[Word]):
        """
        Set the Word objects of the :class:`TableFooter`.

        :param words: List of Word objects, each representing a word within the TableFooter.
                      The list is stored as given; no reordering is performed here.
        :type words: list
        """
        self._words = words

    @property
    def is_floating(self) -> bool:
        """
        :return: Returns the floating flag of the footer (``False`` unless it has been set).
        :rtype: bool
        """

        return self._is_floating

    @is_floating.setter
    def is_floating(self, is_floating: bool):
        """
        Sets the floating flag of the TableFooter entity.

        :param is_floating: Value of the floating flag for this footer.
        :type is_floating: bool
        """

        self._is_floating = is_floating

    @property
    def text(self) -> str:
        """Returns the text in the footer as one space-separated string

        :return: Text in the footer
        :rtype: str
        """
        return " ".join(word.text for word in self.words)

    @property
    def page(self):
        """
        :return: Returns the page number of the page the TableFooter entity is present in.
        :rtype: int
        """

        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the TableFooter entity.

        :param page_num: Page number where the TableFooter entity exists.
        :type page_num: int
        """

        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """

        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the TableFooter entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """

        self._page_id = page_id
Loading