diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..ea32805c4afcd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -11,9 +11,26 @@ including other versions of pandas. .. --------------------------------------------------------------------------- .. _whatsnew_300.enhancements: + + Enhancements ~~~~~~~~~~~~ +.. _whatsnew_300.enhancements.read_excel_table_parameter: + +``Addition of table name parameter in pandas read_excel`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Previously, when using the pandas :func:`read_excel` method, the data read from Excel could not be selected in more detail than on the sheet level. +Distinguishing data that is part of a particular table in the Excel file could be tedious without the use of third-party APIs. +This enhancement, a solution to `issue #38937 <https://github.com/pandas-dev/pandas/issues/38937>`__, introduces the ability to specify a table_name parameter that corresponds to the name of a table in the specified Excel file. +The table_name argument accepts the names as a string, a list of strings, or the value of None, which corresponds to reading in every table in that file. + +The return format has some changes, but only if a table_name parameter is specified. If it is not specified, the return remains a DataFrame or dictionary of DataFrames +holding the data from each sheet. + - If a table_name is specified and a sheet_name is not, the return will be a DataFrame or dictionary of DataFrames holding the table data desired + - If a table_name is specified and a sheet_name is also specified, the return will be a nested dictionary containing two dictionaries: + - The first dictionary is a dictionary of DataFrames corresponding to the data on each sheet + - The second dictionary is a dictionary of DataFrames corresponding to the data in each table ..
_whatsnew_300.enhancements.enhancement1: enhancement1 diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2b35cfa044ae9..bd6223c0e082e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -80,6 +80,7 @@ HashableT, IntStrT, ReadBuffer, + Scalar, Self, SequenceNotStr, StorageOptions, @@ -118,12 +119,32 @@ Available cases: * Defaults to ``0``: 1st sheet as a `DataFrame` + If a table name is specified and a sheet name is not (so it defaults + to 0), no sheets will be loaded. * ``1``: 2nd sheet as a `DataFrame` * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` * ``None``: All worksheets. +table_name : str, list of str, or None, default 0 + Strings are used for table_names that correspond to Excel Table names. + Lists of strings are used to request multiple tables. + Specify ``None`` to get all tables. + + Available cases: + + * Defaults to ``0``: No tables are read or returned + * ``Table1``: Load table with name "Table1", returned as a DataFrame + * ``["Table1", "Table2", "Table3"]``: Load the tables with names "Table1", + "Table2", and "Table3". Returned as a dictionary of DataFrames + * ``sheet_name="Sheet1", table_name="Table1":`` Load both the sheet with + name "Sheet1" and the table with name "Table1". Returned as a nested + dictionary, containing a "sheets" dictionary and a "tables" dictionary. + Each of these 2 dictionaries hold DataFrames of their respective data. + This is the same for if a list of either or both of these parameters + are specified. + header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed DataFrame. If a list of integers is passed those row positions will @@ -297,9 +318,10 @@ Returns ------- -DataFrame or dict of DataFrames +DataFrame, dict of DataFrames, or nested dictionary containing 2 dicts of DataFrames DataFrame from the passed in Excel file. 
See notes in sheet_name - argument for more information on when a dict of DataFrames is returned. + argument for more information on when a dict of DataFrames is returned, + and table_name for when a nested dictionary is returned. See Also -------- @@ -377,6 +399,9 @@ def read_excel( # sheet name is str or int -> DataFrame sheet_name: str | int = ..., *, + # table name is str -> DataFrame + # If sheet name and table name are specified -> Nested Dictionary of DataFrames + table_name: str = ..., header: int | Sequence[int] | None = ..., names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | str | Sequence[int] | None = ..., @@ -406,7 +431,7 @@ def read_excel( skipfooter: int = ..., storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> DataFrame: ... +) -> DataFrame | list[DataFrame] | dict[str, DataFrame]: ... @overload @@ -415,6 +440,9 @@ def read_excel( # sheet name is list or None -> dict[IntStrT, DataFrame] sheet_name: list[IntStrT] | None, *, + # table name is list[str] -> DataFrame + # If sheet name and table name are specified -> Nested Dictionary of DataFrames + table_name: list[str] | None, header: int | Sequence[int] | None = ..., names: SequenceNotStr[Hashable] | range | None = ..., index_col: int | str | Sequence[int] | None = ..., @@ -444,7 +472,7 @@ def read_excel( skipfooter: int = ..., storage_options: StorageOptions = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., -) -> dict[IntStrT, DataFrame]: ... +) -> DataFrame | dict[IntStrT, DataFrame] | dict[str, DataFrame]: ... 
@doc(storage_options=_shared_docs["storage_options"]) @@ -453,6 +481,8 @@ def read_excel( io, sheet_name: str | int | list[IntStrT] | None = 0, *, + # If sheet name and table name are specified -> Nested Dictionary of DataFrames + table_name: str | int | list[str] | None = 0, header: int | Sequence[int] | None = 0, names: SequenceNotStr[Hashable] | range | None = None, index_col: int | str | Sequence[int] | None = None, @@ -483,12 +513,16 @@ def read_excel( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, engine_kwargs: dict | None = None, -) -> DataFrame | dict[IntStrT, DataFrame]: +) -> DataFrame | list[DataFrame] | dict[str, DataFrame]: check_dtype_backend(dtype_backend) should_close = False if engine_kwargs is None: - engine_kwargs = {} - + if table_name == 0: + # The only time table_name will have a value of 0 is when it's not specified + engine_kwargs = {} + else: + # To read in table data the file cannot be read only + engine_kwargs = {"read_only": False} if not isinstance(io, ExcelFile): should_close = True io = ExcelFile( @@ -506,6 +540,7 @@ def read_excel( try: data = io.parse( sheet_name=sheet_name, + table_name=table_name, header=header, names=names, index_col=index_col, @@ -550,7 +585,6 @@ def __init__( ) -> None: if engine_kwargs is None: engine_kwargs = {} - self.handles = IOHandles( handle=filepath_or_buffer, compression={"method": None} ) @@ -714,6 +748,7 @@ def f(skiprows: Sequence, x: int) -> bool: def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, + table_name: str | int | list[str] | None = 0, header: int | Sequence[int] | None = 0, names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, @@ -753,12 +788,29 @@ def parse( else: sheets = [sheet_name] + tables: list[str] | None + if isinstance(table_name, int): + tables = None + if table_name != 0: + raise NotImplementedError + elif isinstance(table_name, list): + tables 
= table_name + ret_dict = True + elif table_name is None: + tables = self.table_names + print(self.table_names) + ret_dict = True + else: + tables = [table_name] + # handle same-type duplicates. sheets = cast(Union[list[int], list[str]], list(dict.fromkeys(sheets).keys())) - output = {} + output = {"sheets": {}, "tables": {}} + outputDict = None last_sheetname = None + outputDict = "sheets" for asheetname in sheets: last_sheetname = asheetname if verbose: @@ -777,146 +829,266 @@ def parse( usecols = maybe_convert_usecols(usecols) if not data: - output[asheetname] = DataFrame() + output[outputDict][asheetname] = DataFrame() continue - is_list_header = False - is_len_one_list_header = False - if is_list_like(header): - assert isinstance(header, Sequence) - is_list_header = True - if len(header) == 1: - is_len_one_list_header = True - - if is_len_one_list_header: - header = cast(Sequence[int], header)[0] - - # forward fill and pull out names for MultiIndex column - header_names = None - if header is not None and is_list_like(header): - assert isinstance(header, Sequence) - - header_names = [] - control_row = [True] * len(data[0]) - - for row in header: - if is_integer(skiprows): - assert isinstance(skiprows, int) - row += skiprows - - if row > len(data) - 1: - raise ValueError( - f"header index {row} exceeds maximum index " - f"{len(data) - 1} of data.", - ) + output = self.parse_multiindex( + data=data, + asheetname=asheetname, + header=header, + output=output, + outputDict=outputDict, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) + if last_sheetname is None: + raise ValueError("Sheet name is an empty list") - data[row], 
control_row = fill_mi_header(data[row], control_row) + last_tablename = None + outputDict = "tables" - if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) - header_names.append(header_name) + if tables is not None: + sheets_reqd = self.get_sheets_required(tables) + for req_sheet in sheets_reqd: + sheet_tables = self.get_sheet_tables(req_sheet) + for atablename in tables: + last_tablename = atablename + table_data = None - # If there is a MultiIndex header and an index then there is also - # a row containing just the index name(s) - has_index_names = False - if is_list_header and not is_len_one_list_header and index_col is not None: - index_col_list: Sequence[int] - if isinstance(index_col, int): - index_col_list = [index_col] - else: - assert isinstance(index_col, Sequence) - index_col_list = index_col - - # We have to handle mi without names. If any of the entries in the data - # columns are not empty, this is a regular row - assert isinstance(header, Sequence) - if len(header) < len(data): - potential_index_names = data[len(header)] - potential_data = [ - x - for i, x in enumerate(potential_index_names) - if not control_row[i] and i not in index_col_list - ] - has_index_names = all(x == "" or x is None for x in potential_data) - - if is_list_like(index_col): - # Forward fill values for MultiIndex index. 
- if header is None: - offset = 0 - elif isinstance(header, int): - offset = 1 + header - else: - offset = 1 + max(header) + if atablename in sheet_tables.keys(): + if verbose: + print(f"Reading Table: {atablename}") - # GH34673: if MultiIndex names present and not defined in the header, - # offset needs to be incremented so that forward filling starts - # from the first MI value instead of the name - if has_index_names: - offset += 1 + file_rows_needed = self._calc_rows( + header, index_col, skiprows, nrows + ) + table_data = self.get_table_data( + req_sheet, sheet_tables[atablename], file_rows_needed + ) + tables.remove(atablename) + + usecols = maybe_convert_usecols(usecols) + + if not table_data: + output[outputDict][atablename] = DataFrame() + continue + + output = self.parse_multiindex( + data=table_data, + asheetname=atablename, + header=header, + output=output, + outputDict=outputDict, + names=names, + index_col=index_col, + usecols=usecols, + dtype=dtype, + skiprows=skiprows, + nrows=nrows, + true_values=true_values, + false_values=false_values, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + dtype_backend=dtype_backend, + **kwds, + ) - # Check if we have an empty dataset - # before trying to collect data. 
- if offset < len(data): - assert isinstance(index_col, Sequence) + if not bool(output["tables"]) and not bool(output["sheets"]): + return DataFrame() - for col in index_col: - last = data[offset][col] + if ret_dict: + if tables is None: + return output["sheets"] + elif sheet_name == 0: + return output["tables"] + else: + return output + elif tables is not None and sheet_name != 0: + return output + elif tables is not None and sheet_name == 0: + return output["tables"][last_tablename] + else: + return output["sheets"][last_sheetname] - for row in range(offset + 1, len(data)): - if data[row][col] == "" or data[row][col] is None: - data[row][col] = last - else: - last = data[row][col] + def parse_multiindex( + self, + data: list[list[Scalar]] | None = None, + asheetname: str | int | None = None, + header: int | Sequence[int] | None = 0, + output: dict | None = None, + outputDict: str | None = None, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | Sequence[int] | None = None, + usecols=None, + dtype: DtypeArg | None = None, + skiprows: Sequence[int] | int | Callable[[int], object] | None = None, + nrows: int | None = None, + true_values: Iterable[Hashable] | None = None, + false_values: Iterable[Hashable] | None = None, + na_values=None, + parse_dates: list | dict | bool = False, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: dict[Hashable, str] | str | None = None, + thousands: str | None = None, + decimal: str = ".", + comment: str | None = None, + skipfooter: int = 0, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + **kwds, + ): + is_list_header = False + is_len_one_list_header = False + if is_list_like(header): + assert isinstance(header, Sequence) + is_list_header = True + if len(header) == 1: + is_len_one_list_header = True + + if is_len_one_list_header: + header = cast(Sequence[int], header)[0] + + # forward fill and pull out names for MultiIndex column + header_names = None + if header is 
not None and is_list_like(header): + assert isinstance(header, Sequence) + + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + assert isinstance(skiprows, int) + row += skiprows + + if row > len(data) - 1: + raise ValueError( + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", + ) - # GH 12292 : error when read one empty column from excel file - try: - parser = TextParser( - data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - skip_blank_lines=False, # GH 39808 - parse_dates=parse_dates, - date_parser=date_parser, - date_format=date_format, - thousands=thousands, - decimal=decimal, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - dtype_backend=dtype_backend, - **kwds, - ) + data[row], control_row = fill_mi_header(data[row], control_row) - output[asheetname] = parser.read(nrows=nrows) + if index_col is not None: + header_name, _ = pop_header_name(data[row], index_col) + header_names.append(header_name) - if header_names: - output[asheetname].columns = output[asheetname].columns.set_names( - header_names - ) + # If there is a MultiIndex header and an index then there is also + # a row containing just the index name(s) + has_index_names = False + if is_list_header and not is_len_one_list_header and index_col is not None: + index_col_list: Sequence[int] + if isinstance(index_col, int): + index_col_list = [index_col] + else: + assert isinstance(index_col, Sequence) + index_col_list = index_col + + # We have to handle mi without names. 
If any of the entries in the data + # columns are not empty, this is a regular row + assert isinstance(header, Sequence) + if len(header) < len(data): + potential_index_names = data[len(header)] + potential_data = [ + x + for i, x in enumerate(potential_index_names) + if not control_row[i] and i not in index_col_list + ] + has_index_names = all(x == "" or x is None for x in potential_data) + + if is_list_like(index_col): + # Forward fill values for MultiIndex index. + if header is None: + offset = 0 + elif isinstance(header, int): + offset = 1 + header + else: + offset = 1 + max(header) + + # GH34673: if MultiIndex names present and not defined in the header, + # offset needs to be incremented so that forward filling starts + # from the first MI value instead of the name + if has_index_names: + offset += 1 + + # Check if we have an empty dataset + # before trying to collect data. + if offset < len(data): + assert isinstance(index_col, Sequence) + + for col in index_col: + last = data[offset][col] + + for row in range(offset + 1, len(data)): + if data[row][col] == "" or data[row][col] is None: + data[row][col] = last + else: + last = data[row][col] + + # GH 12292 : error when read one empty column from excel file + try: + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + skip_blank_lines=False, # GH 39808 + parse_dates=parse_dates, + date_parser=date_parser, + date_format=date_format, + thousands=thousands, + decimal=decimal, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + dtype_backend=dtype_backend, + **kwds, + ) - except EmptyDataError: - # No Data, return an empty DataFrame - output[asheetname] = DataFrame() + output[outputDict][asheetname] = parser.read(nrows=nrows) - except Exception as err: - err.args = (f"{err.args[0]} (sheet: {asheetname})", 
*err.args[1:]) - raise err + if header_names: + output[outputDict][asheetname].columns = output[outputDict][ + asheetname + ].columns.set_names(header_names) - if last_sheetname is None: - raise ValueError("Sheet name is an empty list") + except EmptyDataError: + # No Data, return an empty DataFrame + output[outputDict][asheetname] = DataFrame() - if ret_dict: - return output - else: - return output[last_sheetname] + except Exception as err: + err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:]) + raise err + + return output @doc(storage_options=_shared_docs["storage_options"]) @@ -1585,6 +1757,7 @@ def __fspath__(self): def parse( self, sheet_name: str | int | list[int] | list[str] | None = 0, + table_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, @@ -1747,6 +1920,7 @@ def parse( """ return self._reader.parse( sheet_name=sheet_name, + table_name=table_name, header=header, names=names, index_col=index_col, @@ -1824,6 +1998,10 @@ def sheet_names(self): """ return self._reader.sheet_names + @property + def table_names(self): + return self._reader.table_names + def close(self) -> None: """close io if necessary""" self._reader.close() diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 218a592c22b4a..1954ad5c5f8d2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -610,10 +610,44 @@ def get_sheet_data( ) -> list[list[Scalar]]: if self.book.read_only: sheet.reset_dimensions() + return self.get_data(sheet.rows, file_rows_needed) + @property + def table_names(self) -> list[str]: + tables = None + tables = [] + for sheet in self.book.worksheets: + for table in sheet.tables.values(): + tables.append(table.name) + return tables + + def get_sheets_required(self, tables): + sheets_reqd = [] + for sheet in self.book.worksheets: + for table in sheet.tables.values(): 
+ if table.name in tables: + sheets_reqd.append(sheet) + continue + return sheets_reqd + + def get_sheet_tables(self, sheet): + tables = {} + for table in sheet.tables.values(): + tables[table.name] = table + return tables + + def get_table_data( + self, sheet, tablename, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: + table = sheet[tablename.ref] + return self.get_data(table, file_rows_needed) + + def get_data( + self, input, file_rows_needed: int | None = None + ) -> list[list[Scalar]]: data: list[list[Scalar]] = [] last_row_with_data = -1 - for row_number, row in enumerate(sheet.rows): + for row_number, row in enumerate(input): converted_row = [self._convert_cell(cell) for cell in row] while converted_row and converted_row[-1] == "": # trim trailing empty elements @@ -636,5 +670,4 @@ def get_sheet_data( data_row + (max_width - len(data_row)) * empty_cell for data_row in data ] - return data diff --git a/pandas/tests/io/data/excel/test_tables.xlsm b/pandas/tests/io/data/excel/test_tables.xlsm new file mode 100644 index 0000000000000..0eebb8d5e6da8 Binary files /dev/null and b/pandas/tests/io/data/excel/test_tables.xlsm differ diff --git a/pandas/tests/io/data/excel/test_tables.xlsx b/pandas/tests/io/data/excel/test_tables.xlsx new file mode 100644 index 0000000000000..07b6eb7d8860f Binary files /dev/null and b/pandas/tests/io/data/excel/test_tables.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..d696aefb94f46 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -792,6 +792,7 @@ def test_read_excel_blank_with_header(self, read_ext): expected = DataFrame(columns=["col_1", "col_2"]) actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) + print("This previous failed") def test_exception_message_includes_sheet_name(self, read_ext): # GH 48706 @@ -800,6 +801,86 @@ def 
test_exception_message_includes_sheet_name(self, read_ext): with pytest.raises(ZeroDivisionError, match=r" \(sheet: Sheet1\)$"): pd.read_excel("test1" + read_ext, usecols=lambda x: 1 / 0, sheet_name=None) + def test_reading_all_tables(self, engine): + if engine == "openpyxl": + expected = { + "Table1": DataFrame([[1, 2]], columns=["Col1", "Col2"]), + "Table2": DataFrame([[100]], columns=["Column1"]), + } + actual = pd.read_excel("test_tables.xlsx", table_name=None) + tm.assert_frame_equal(actual["Table1"], expected["Table1"]) + tm.assert_frame_equal(actual["Table2"], expected["Table2"]) + else: + pytest.skip( + f"Skipped for {engine}, reading tables with this engine is unsupported" + ) + + def test_reading_table(self, engine): + if engine == "openpyxl": + expected = DataFrame([[1, 2]], columns=["Col1", "Col2"]) + actual = pd.read_excel("test_tables.xlsx", table_name="Table1") + tm.assert_frame_equal(actual, expected) + else: + pytest.skip( + f"Skipped for {engine}, reading tables with this engine is unsupported" + ) + + def test_reading_table_and_sheet_no_header(self, engine): + if engine == "openpyxl": + expected = { + "sheets": { + "Sheet1": DataFrame( + [[1, "data", "Col1", "Col2"], [np.nan, np.nan, 1, 2]], + columns=[0, 1, 2, 3], + dtype="object", + ) + }, + "tables": { + "Table1": DataFrame([["Col1", "Col2"], [1, 2]], columns=[0, 1]) + }, + } + actual = pd.read_excel( + "test_tables.xlsx", + sheet_name="Sheet1", + table_name="Table1", + header=None, + dtype="object", + ) + tm.assert_frame_equal( + actual["sheets"]["Sheet1"], expected["sheets"]["Sheet1"] + ) + tm.assert_frame_equal( + actual["tables"]["Table1"], expected["tables"]["Table1"] + ) + else: + pytest.skip( + f"Skipped for {engine}, reading tables with this engine is unsupported" + ) + + def test_reading_table_and_sheet_with_header(self, engine): + if engine == "openpyxl": + expected = { + "sheets": { + "Sheet1": DataFrame( + [[np.nan, np.nan, 1, 2]], columns=[1, "data", "Col1", "Col2"] + ) + }, + 
"tables": {"Table1": DataFrame([[1, 2]], columns=["Col1", "Col2"])}, + } + actual = pd.read_excel( + "test_tables.xlsx", sheet_name="Sheet1", table_name="Table1" + ) + tm.assert_frame_equal( + actual["sheets"]["Sheet1"], expected["sheets"]["Sheet1"] + ) + tm.assert_frame_equal( + actual["tables"]["Table1"], expected["tables"]["Table1"] + ) + else: + pytest.skip( + f"Skipped for {engine}, reading tables with this engine is unsupported" + ) + @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False