@@ -817,95 +817,118 @@ def _parse_sheet(
817817 dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
818818 ** kwds ,
819819 ):
820- is_list_header = False
821- is_len_one_list_header = False
822- if is_list_like (header ):
823- assert isinstance (header , Sequence )
824- is_list_header = True
825- if len (header ) == 1 :
826- is_len_one_list_header = True
827-
828- if is_len_one_list_header :
829- header = cast (Sequence [int ], header )[0 ]
830-
831- # forward fill and pull out names for MultiIndex column
832- header_names = None
833- if header is not None and is_list_like (header ):
834- assert isinstance (header , Sequence )
835-
836- header_names = []
837- control_row = [True ] * len (data [0 ])
838-
839- for row in header :
840- if is_integer (skiprows ):
841- assert isinstance (skiprows , int )
842- row += skiprows
843-
844- if row > len (data ) - 1 :
845- raise ValueError (
846- f"header index { row } exceeds maximum index "
847- f"{ len (data ) - 1 } of data." ,
848- )
849-
850- data [row ], control_row = fill_mi_header (data [row ], control_row )
820+ try :
851821
852- if index_col is not None :
853- header_name , _ = pop_header_name (data [row ], index_col )
854- header_names .append (header_name )
822+ # header indexes refer to row positions after skiprows are removed,
823+ # so we build a map from those post-skiprows positions to the
824+ # original row indexes in data.
825+ if skiprows is None :
826+ ixmap = list (range (len (data )))
827+ elif is_integer (skiprows ):
828+ ixmap = list (range (skiprows , len (data )))
829+ elif is_list_like (skiprows ):
830+ skiprows_set = set (cast (Sequence [int ], skiprows ))
831+ ixmap = [ix for ix , _ in enumerate (data ) if ix not in skiprows_set ]
832+ elif callable (skiprows ):
833+ ixmap = [ix for ix , _ in enumerate (data ) if not skiprows (ix )]
834+ else :
835+ raise ValueError (
836+ "skiprows must be an integer or a list of integers"
837+ )
838+ nixs = len (ixmap )
855839
856- # If there is a MultiIndex header and an index then there is also
857- # a row containing just the index name(s)
858- has_index_names = False
859- if is_list_header and not is_len_one_list_header and index_col is not None :
840+ index_col_has_names = False
860841 index_col_set : set [int ]
861- if isinstance (index_col , int ):
842+ if index_col is None :
843+ index_col_set = set ()
844+ elif isinstance (index_col , str ):
845+ index_col_set = set ()
846+ index_col_has_names = True
847+ elif isinstance (index_col , int ):
862848 index_col_set = {index_col }
863- else :
864- assert isinstance (index_col , Sequence )
849+ elif is_list_like (index_col ):
865850 index_col_set = set (index_col )
866-
867- # We have to handle mi without names. If any of the entries in the data
868- # columns are not empty, this is a regular row
869- assert isinstance (header , Sequence )
870- if len (header ) < len (data ):
871- potential_index_names = data [len (header )]
872- has_index_names = all (
873- x == "" or x is None
874- for i , x in enumerate (potential_index_names )
875- if not control_row [i ] and i not in index_col_set
851+ else :
852+ raise ValueError (
853+ "index_col must be a string, an integer or a list of integers"
876854 )
855+ has_index = len (index_col_set ) > 0
856+ has_index_names = False
877857
878- if is_list_like (index_col ):
879- # Forward fill values for MultiIndex index.
858+ header_list : Sequence [int ]
880859 if header is None :
881- offset = 0
860+ header_list = []
882861 elif isinstance (header , int ):
883- offset = 1 + header
862+ header_list = [header ]
863+ elif is_list_like (header ):
864+ header_list = header
884865 else :
885- offset = 1 + max (header )
866+ raise ValueError (
867+ "header must be an integer or a list of integers"
868+ )
886869
887- # GH34673: if MultiIndex names present and not defined in the header,
888- # offset needs to be incremented so that forward filling starts
889- # from the first MI value instead of the name
890- if has_index_names :
891- offset += 1
870+ header_names = []
871+
872+ if len (header_list ) == 0 :
873+ offset = 0
874+ else :
875+ max_header = max (header_list )
876+ offset = max_header + 1
892877
893- # Check if we have an empty dataset
894- # before trying to collect data.
895- if offset < len (data ):
896- assert isinstance (index_col , Sequence )
878+ if max_header >= nixs :
879+ raise ValueError (
880+ f"header index { max_header } exceeds maximum index "
881+ f"{ nixs - 1 } of data." ,
882+ )
897883
898- for col in index_col :
899- last = data [offset ][col ]
884+ if len (header_list ) > 1 :
885+ if index_col_has_names :
886+ raise ValueError (
887+ "named index_col can not be used together "
888+ "with multi-index header"
889+ )
890+
891+ # Forward fill and pull out names for MultiIndex column
892+ control_row = [True ] * len (data [0 ])
893+ for row in header_list :
894+ row1 = ixmap [row ]
895+ data [row1 ], control_row = fill_mi_header (data [row1 ],
896+ control_row )
897+
898+ if has_index :
899+ header_name , _ = pop_header_name (data [row1 ],
900+ sorted (index_col_set ))
901+ if header_name :
902+ header_names .append (header_name )
903+
904+ # If there is a MultiIndex header and an index then
905+ # there may also be a row containing just the index
906+ # name(s)
907+ if has_index and offset < nixs :
908+ # We have to handle mi without names. If any
909+ # of the entries in the data columns are not
910+ # empty, this is a regular row.
911+
912+ potential_index_names = data [ixmap [offset ]]
913+ has_index_names = all (
914+ x == "" or x is None
915+ for i , x in enumerate (potential_index_names )
916+ if not control_row [i ] and i not in index_col_set
917+ )
918+ if has_index_names :
919+ offset += 1
900920
901- for row in range (offset + 1 , len (data )):
902- if data [row ][col ] == "" or data [row ][col ] is None :
903- data [row ][col ] = last
921+ # Forward fill index columns:
922+ # TODO: also forward fill when index columns are selected by name.
923+ if has_index and offset < nixs :
924+ for col in index_col_set :
925+ last = data [ixmap [offset ]][col ]
926+ for row1 in ixmap [offset + 1 :]:
927+ if data [row1 ][col ] == "" or data [row1 ][col ] is None :
928+ data [row1 ][col ] = last
904929 else :
905- last = data [row ][col ]
930+ last = data [row1 ][col ]
906931
907- # GH 12292 : error when read one empty column from excel file
908- try :
909932 parser = TextParser (
910933 data ,
911934 names = names ,
@@ -933,9 +956,8 @@ def _parse_sheet(
933956 output [asheetname ] = parser .read (nrows = nrows )
934957
935958 if header_names :
936- output [asheetname ].columns = output [asheetname ].columns .set_names (
937- header_names
938- )
959+ output [asheetname ].columns = \
960+ output [asheetname ].columns .set_names (header_names )
939961
940962 except EmptyDataError :
941963 # No Data, return an empty DataFrame
0 commit comments