@@ -817,95 +817,115 @@ def _parse_sheet(
817
817
dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
818
818
** kwds ,
819
819
):
820
- is_list_header = False
821
- is_len_one_list_header = False
822
- if is_list_like (header ):
823
- assert isinstance (header , Sequence )
824
- is_list_header = True
825
- if len (header ) == 1 :
826
- is_len_one_list_header = True
827
-
828
- if is_len_one_list_header :
829
- header = cast (Sequence [int ], header )[0 ]
830
-
831
- # forward fill and pull out names for MultiIndex column
832
- header_names = None
833
- if header is not None and is_list_like (header ):
834
- assert isinstance (header , Sequence )
835
-
836
- header_names = []
837
- control_row = [True ] * len (data [0 ])
838
-
839
- for row in header :
840
- if is_integer (skiprows ):
841
- assert isinstance (skiprows , int )
842
- row += skiprows
843
-
844
- if row > len (data ) - 1 :
845
- raise ValueError (
846
- f"header index { row } exceeds maximum index "
847
- f"{ len (data ) - 1 } of data." ,
848
- )
849
-
850
- data [row ], control_row = fill_mi_header (data [row ], control_row )
820
+ try :
851
821
852
- if index_col is not None :
853
- header_name , _ = pop_header_name (data [row ], index_col )
854
- header_names .append (header_name )
822
+ # header indexes are relative to the rows that remain after skiprows
823
+ # is applied, so we build an index map from those positions to the
824
+ # original row indexes in data.
825
+ if skiprows is None :
826
+ ixmap = range (len (data ))
827
+ elif is_integer (skiprows ):
828
+ ixmap = range (skiprows , len (data ))
829
+ elif is_list_like (skiprows ):
830
+ skiprows = set (skiprows )
831
+ ixmap = [ix for ix , _ in enumerate (data ) if ix not in skiprows ]
832
+ elif callable (skiprows ):
833
+ ixmap = [ix for ix , _ in enumerate (data ) if not skiprows (ix )]
834
+ else :
835
+ raise ValueError (
836
+ "skiprows must be an integer or a list of integers"
837
+ )
838
+ nixs = len (ixmap )
855
839
856
- # If there is a MultiIndex header and an index then there is also
857
- # a row containing just the index name(s)
858
- has_index_names = False
859
- if is_list_header and not is_len_one_list_header and index_col is not None :
840
+ index_col_has_names = False
860
841
index_col_set : set [int ]
861
- if isinstance (index_col , int ):
842
+ if index_col is None :
843
+ index_col_set = set ([])
844
+ elif isinstance (index_col , str ):
845
+ index_col_set = set ([])
846
+ index_col_has_names = True
847
+ elif is_integer (index_col ):
862
848
index_col_set = {index_col }
863
- else :
864
- assert isinstance (index_col , Sequence )
849
+ elif is_list_like (index_col ):
865
850
index_col_set = set (index_col )
866
-
867
- # We have to handle mi without names. If any of the entries in the data
868
- # columns are not empty, this is a regular row
869
- assert isinstance (header , Sequence )
870
- if len (header ) < len (data ):
871
- potential_index_names = data [len (header )]
872
- has_index_names = all (
873
- x == "" or x is None
874
- for i , x in enumerate (potential_index_names )
875
- if not control_row [i ] and i not in index_col_set
851
+ else :
852
+ raise ValueError (
853
+ "index_col must be a string, an integer or a list of integers"
876
854
)
855
+ has_index = len (index_col_set ) > 0
856
+ has_index_names = False
877
857
878
- if is_list_like (index_col ):
879
- # Forward fill values for MultiIndex index.
880
858
if header is None :
859
+ header_list = []
860
+ elif is_integer (header ):
861
+ header_list = [header ]
862
+ elif is_list_like (header ):
863
+ header_list = header
864
+ else :
865
+ raise ValueError (
866
+ "header must be an integer or a list of integers"
867
+ )
868
+
869
+ header_names = []
870
+
871
+ if len (header_list ) == 0 :
881
872
offset = 0
882
- elif isinstance (header , int ):
883
- offset = 1 + header
884
873
else :
885
- offset = 1 + max (header )
874
+ max_header = max (header_list )
875
+ offset = max_header + 1
886
876
887
- # GH34673: if MultiIndex names present and not defined in the header,
888
- # offset needs to be incremented so that forward filling starts
889
- # from the first MI value instead of the name
877
+ if max_header >= nixs :
878
+ raise ValueError (
879
+ f"header index { max_header } exceeds maximum index "
880
+ f"{ nixs - 1 } of data." ,
881
+ )
882
+
883
+ if len (header_list ) > 1 :
884
+ if index_col_has_names :
885
+ raise ValueError (
886
+ "named index_col can not be used together with multi-index header"
887
+ )
888
+
889
+ # Forward fill and pull out names for MultiIndex column
890
+ control_row = [True ] * len (data [0 ])
891
+ for row in header :
892
+ row1 = ixmap [row ]
893
+ data [row1 ], control_row = fill_mi_header (data [row1 ],
894
+ control_row )
895
+
896
+ if has_index :
897
+ header_name , _ = pop_header_name (data [row1 ],
898
+ index_col )
899
+ header_names .append (header_name )
900
+
901
+ # If there is a MultiIndex header and an index then
902
+ # there may also be a row containing just the index
903
+ # name(s)
904
+ if has_index and offset < nixs :
905
+ # We have to handle mi without names. If any
906
+ # of the entries in the data columns are not
907
+ # empty, this is a regular row.
908
+
909
+ potential_index_names = data [ixmap [offset ]]
910
+ has_index_names = all (
911
+ x == "" or x is None
912
+ for i , x in enumerate (potential_index_names )
913
+ if not control_row [i ] and i not in index_col_set
914
+ )
890
915
if has_index_names :
891
916
offset += 1
892
917
893
- # Check if we have an empty dataset
894
- # before trying to collect data.
895
- if offset < len (data ):
896
- assert isinstance (index_col , Sequence )
897
-
898
- for col in index_col :
899
- last = data [offset ][col ]
900
-
901
- for row in range (offset + 1 , len (data )):
902
- if data [row ][col ] == "" or data [row ][col ] is None :
903
- data [row ][col ] = last
918
+ # Forward fill index columns:
919
+ # TODO: also forward fill when index columns are selected by name.
920
+ if has_index and offset < nixs :
921
+ for col in index_col_set :
922
+ last = data [ixmap [offset ]][col ]
923
+ for row1 in ixmap [offset + 1 :]:
924
+ if data [row1 ][col ] == "" or data [row1 ][col ] is None :
925
+ data [row1 ][col ] = last
904
926
else :
905
- last = data [row ][col ]
927
+ last = data [row1 ][col ]
906
928
907
- # GH 12292 : error when read one empty column from excel file
908
- try :
909
929
parser = TextParser (
910
930
data ,
911
931
names = names ,
@@ -933,9 +953,8 @@ def _parse_sheet(
933
953
output [asheetname ] = parser .read (nrows = nrows )
934
954
935
955
if header_names :
936
- output [asheetname ].columns = output [asheetname ].columns .set_names (
937
- header_names
938
- )
956
+ output [asheetname ].columns = \
957
+ output [asheetname ].columns .set_names (header_names )
939
958
940
959
except EmptyDataError :
941
960
# No Data, return an empty DataFrame
0 commit comments