Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 1.0.8-dev1

* Enhancement: Optimized cells_to_html (codeflash)

## 1.0.7

* Fix a hardcoded file extension causing confusion in the logs
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0.7" # pragma: no cover
__version__ = "1.0.8-dev1" # pragma: no cover
53 changes: 31 additions & 22 deletions unstructured_inference/models/tables.py
Original file line number Diff line number Diff line change
Expand Up
@@ -688,25 +688,32 @@ def fill_cells(cells: List[dict]) -> List[dict]:
if not cells:
return []

table_rows_no = max({row for cell in cells for row in cell["row_nums"]})
table_cols_no = max({col for cell in cells for col in cell["column_nums"]})
filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool)
# Find max row and col indices
max_row = max(row for cell in cells for row in cell["row_nums"])
max_col = max(col for cell in cells for col in cell["column_nums"])
filled = set()
for cell in cells:
for row in cell["row_nums"]:
for col in cell["column_nums"]:
filled[row, col] = True
# add cells for which filled is false
header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]}
filled.add((row, col))
header_rows = set()
for cell in cells:
if cell["column header"]:
header_rows.update(cell["row_nums"])

# Compose output list directly for speed
new_cells = cells.copy()
not_filled_idx = np.where(filled == False) # noqa: E712
for row, col in zip(not_filled_idx[0], not_filled_idx[1]):
new_cell = {
"row_nums": [row],
"column_nums": [col],
"cell text": "",
"column header": row in header_rows,
}
new_cells.append(new_cell)
for row in range(max_row + 1):
for col in range(max_col + 1):
if (row, col) not in filled:
new_cells.append(
{
"row_nums": [row],
"column_nums": [col],
"cell text": "",
"column header": row in header_rows,
}
)
return new_cells


Expand All
@@ -725,18 +732,20 @@ def cells_to_html(cells: List[dict]) -> str:
Returns:
str: HTML table string
"""
cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))
# Pre-sort with tuple key, as per original
cells_filled = fill_cells(cells)
cells_sorted = sorted(cells_filled, key=lambda k: (min(k["row_nums"]), min(k["column_nums"])))

table = ET.Element("table")
current_row = -1

table_header = None
table_has_header = any(cell["column header"] for cell in cells)
if table_has_header:
table_header = ET.SubElement(table, "thead")

# Check if any column header exists
table_has_header = any(cell["column header"] for cell in cells_sorted)
table_header = ET.SubElement(table, "thead") if table_has_header else None
table_body = ET.SubElement(table, "tbody")
for cell in cells:

row = None
for cell in cells_sorted:
this_row = min(cell["row_nums"])
attrib = {}
colspan = len(cell["column_nums"])
Expand Down