diff --git a/CHANGELOG.md b/CHANGELOG.md index b1375aad..84078cac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.0.8-dev1 + +* Enhancement: Optimized cells_to_html (codeflash) + ## 1.0.7 * Fix a hardcoded file extension causing confusion in the logs diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 1afb74e9..cbbb1304 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "1.0.7" # pragma: no cover +__version__ = "1.0.8-dev1" # pragma: no cover diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index c994207b..a9bc8269 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -688,25 +688,32 @@ def fill_cells(cells: List[dict]) -> List[dict]: if not cells: return [] - table_rows_no = max({row for cell in cells for row in cell["row_nums"]}) - table_cols_no = max({col for cell in cells for col in cell["column_nums"]}) - filled = np.zeros((table_rows_no + 1, table_cols_no + 1), dtype=bool) + # Find max row and col indices + max_row = max(row for cell in cells for row in cell["row_nums"]) + max_col = max(col for cell in cells for col in cell["column_nums"]) + filled = set() for cell in cells: for row in cell["row_nums"]: for col in cell["column_nums"]: - filled[row, col] = True - # add cells for which filled is false - header_rows = {row for cell in cells if cell["column header"] for row in cell["row_nums"]} + filled.add((row, col)) + header_rows = set() + for cell in cells: + if cell["column header"]: + header_rows.update(cell["row_nums"]) + + # Compose output list directly for speed new_cells = cells.copy() - not_filled_idx = np.where(filled == False) # noqa: E712 - for row, col in zip(not_filled_idx[0], not_filled_idx[1]): - new_cell = { - "row_nums": [row], - "column_nums": [col], - "cell text": "", - "column header": row in header_rows, - } - new_cells.append(new_cell) + for row in range(max_row + 1): + for col in range(max_col + 1): + if (row, col) not in filled: + new_cells.append( + { + "row_nums": [row], + "column_nums": [col], + "cell text": "", + "column header": row in header_rows, + } + ) return new_cells @@ -725,18 +732,20 @@ def cells_to_html(cells: List[dict]) -> str: Returns: str: HTML table string """ - cells = sorted(fill_cells(cells), key=lambda k: (min(k["row_nums"]), min(k["column_nums"]))) + # Pre-sort with tuple key, as per original + cells_filled = fill_cells(cells) + cells_sorted = sorted(cells_filled, key=lambda k: (min(k["row_nums"]), min(k["column_nums"]))) table = ET.Element("table") current_row = -1 - table_header = None - table_has_header = any(cell["column header"] for cell in cells) - if table_has_header: - table_header = ET.SubElement(table, "thead") - + # Check if any column header exists + table_has_header = any(cell["column header"] for cell in cells_sorted) + table_header = ET.SubElement(table, "thead") if table_has_header else None table_body = ET.SubElement(table, "tbody") - for cell in cells: + + row = None + for cell in cells_sorted: this_row = min(cell["row_nums"]) attrib = {} colspan = len(cell["column_nums"])