Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-## 1.0.8-dev0
+## 1.0.8-dev1

+* Enhancement: Optimized `cells_to_html` for an 8% speedup in some cases (codeflash)
 * Enhancement: Optimized `outputs_to_objects` for an 88% speedup in some cases (codeflash)

## 1.0.7
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
-__version__ = "1.0.8-dev0" # pragma: no cover
+__version__ = "1.0.8-dev1" # pragma: no cover
58 changes: 34 additions & 24 deletions unstructured_inference/models/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,25 +690,32 @@ def fill_cells(cells: List[dict]) -> List[dict]:
if not cells:
return []

table_rows_no = max({row for cell in cells for row in cell["row_nums"]})
table_cols_no = max({col for cell in cells for col in cell["column_nums"]})
filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool)
# Find max row and col indices
max_row = max(row for cell in cells for row in cell["row_nums"])
max_col = max(col for cell in cells for col in cell["column_nums"])
filled = set()
for cell in cells:
for row in cell["row_nums"]:
for col in cell["column_nums"]:
filled[row, col] = True
# add cells for which filled is false
header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]}
filled.add((row, col))
header_rows = set()
for cell in cells:
if cell["column header"]:
header_rows.update(cell["row_nums"])

# Compose output list directly for speed
new_cells = cells.copy()
not_filled_idx = np.where(filled == False) # noqa: E712
for row, col in zip(not_filled_idx[0], not_filled_idx[1]):
new_cell = {
"row_nums": [row],
"column_nums": [col],
"cell text": "",
"column header": row in header_rows,
}
new_cells.append(new_cell)
for row in range(max_row + 1):
for col in range(max_col + 1):
if (row, col) not in filled:
new_cells.append(
{
"row_nums": [row],
"column_nums": [col],
"cell text": "",
"column header": row in header_rows,
}
)
return new_cells


Expand All @@ -727,18 +734,20 @@ def cells_to_html(cells: List[dict]) -> str:
Returns:
str: HTML table string
"""
cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
# Pre-sort with tuple key, as per original
cells_filled = fill_cells(cells)
cells_sorted = sorted(cells_filled, key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))

table = ET.Element("table")
current_row = -1

table_header = None
table_has_header = any(cell["column header"] for cell in cells)
if table_has_header:
table_header = ET.SubElement(table, "thead")

# Check if any column header exists
table_has_header = any(cell["column header"] for cell in cells_sorted)
table_header = ET.SubElement(table, "thead") if table_has_header else None
table_body = ET.SubElement(table, "tbody")
for cell in cells:

row = None
for cell in cells_sorted:
this_row = min(cell["row_nums"])
attrib = {}
colspan = len(cell["column_nums"])
Expand All @@ -756,8 +765,9 @@ def cells_to_html(cells: List[dict]) -> str:
table_subelement = table_body
cell_tag = "td"
row = ET.SubElement(table_subelement, "tr") # type: ignore
tcell = ET.SubElement(row, cell_tag, attrib=attrib)
tcell.text = cell["cell text"]
if row is not None:
tcell = ET.SubElement(row, cell_tag, attrib=attrib)
tcell.text = cell["cell text"]

return str(ET.tostring(table, encoding="unicode", short_empty_elements=False))

Expand Down