@@ -817,95 +817,118 @@ def _parse_sheet(
817
817
dtype_backend : DtypeBackend | lib .NoDefault = lib .no_default ,
818
818
** kwds ,
819
819
):
820
- is_list_header = False
821
- is_len_one_list_header = False
822
- if is_list_like (header ):
823
- assert isinstance (header , Sequence )
824
- is_list_header = True
825
- if len (header ) == 1 :
826
- is_len_one_list_header = True
827
-
828
- if is_len_one_list_header :
829
- header = cast (Sequence [int ], header )[0 ]
830
-
831
- # forward fill and pull out names for MultiIndex column
832
- header_names = None
833
- if header is not None and is_list_like (header ):
834
- assert isinstance (header , Sequence )
835
-
836
- header_names = []
837
- control_row = [True ] * len (data [0 ])
838
-
839
- for row in header :
840
- if is_integer (skiprows ):
841
- assert isinstance (skiprows , int )
842
- row += skiprows
843
-
844
- if row > len (data ) - 1 :
845
- raise ValueError (
846
- f"header index { row } exceeds maximum index "
847
- f"{ len (data ) - 1 } of data." ,
848
- )
849
-
850
- data [row ], control_row = fill_mi_header (data [row ], control_row )
820
+ try :
851
821
852
- if index_col is not None :
853
- header_name , _ = pop_header_name (data [row ], index_col )
854
- header_names .append (header_name )
822
+ # header indexes reference rows after removing skiprows, so we
823
+ # create an index map from the post-skiprows row positions to the
824
+ # original indexes.
825
+ if skiprows is None :
826
+ ixmap = list (range (len (data )))
827
+ elif is_integer (skiprows ):
828
+ ixmap = list (range (skiprows , len (data )))
829
+ elif is_list_like (skiprows ):
830
+ skiprows_set = set (cast (Sequence [int ], skiprows ))
831
+ ixmap = [ix for ix , _ in enumerate (data ) if ix not in skiprows_set ]
832
+ elif callable (skiprows ):
833
+ ixmap = [ix for ix , _ in enumerate (data ) if not skiprows (ix )]
834
+ else :
835
+ raise ValueError (
836
+ "skiprows must be an integer or a list of integers"
837
+ )
838
+ nixs = len (ixmap )
855
839
856
- # If there is a MultiIndex header and an index then there is also
857
- # a row containing just the index name(s)
858
- has_index_names = False
859
- if is_list_header and not is_len_one_list_header and index_col is not None :
840
+ index_col_has_names = False
860
841
index_col_set : set [int ]
861
- if isinstance (index_col , int ):
842
+ if index_col is None :
843
+ index_col_set = set ()
844
+ elif isinstance (index_col , str ):
845
+ index_col_set = set ()
846
+ index_col_has_names = True
847
+ elif isinstance (index_col , int ):
862
848
index_col_set = {index_col }
863
- else :
864
- assert isinstance (index_col , Sequence )
849
+ elif is_list_like (index_col ):
865
850
index_col_set = set (index_col )
866
-
867
- # We have to handle mi without names. If any of the entries in the data
868
- # columns are not empty, this is a regular row
869
- assert isinstance (header , Sequence )
870
- if len (header ) < len (data ):
871
- potential_index_names = data [len (header )]
872
- has_index_names = all (
873
- x == "" or x is None
874
- for i , x in enumerate (potential_index_names )
875
- if not control_row [i ] and i not in index_col_set
851
+ else :
852
+ raise ValueError (
853
+ "index_col must be a string, an integer or a list of integers"
876
854
)
855
+ has_index = len (index_col_set ) > 0
856
+ has_index_names = False
877
857
878
- if is_list_like (index_col ):
879
- # Forward fill values for MultiIndex index.
858
+ header_list : Sequence [int ]
880
859
if header is None :
881
- offset = 0
860
+ header_list = []
882
861
elif isinstance (header , int ):
883
- offset = 1 + header
862
+ header_list = [header ]
863
+ elif is_list_like (header ):
864
+ header_list = header
884
865
else :
885
- offset = 1 + max (header )
866
+ raise ValueError (
867
+ "header must be an integer or a list of integers"
868
+ )
886
869
887
- # GH34673: if MultiIndex names present and not defined in the header,
888
- # offset needs to be incremented so that forward filling starts
889
- # from the first MI value instead of the name
890
- if has_index_names :
891
- offset += 1
870
+ header_names = []
871
+
872
+ if len (header_list ) == 0 :
873
+ offset = 0
874
+ else :
875
+ max_header = max (header_list )
876
+ offset = max_header + 1
892
877
893
- # Check if we have an empty dataset
894
- # before trying to collect data.
895
- if offset < len (data ):
896
- assert isinstance (index_col , Sequence )
878
+ if max_header >= nixs :
879
+ raise ValueError (
880
+ f"header index { max_header } exceeds maximum index "
881
+ f"{ nixs - 1 } of data." ,
882
+ )
897
883
898
- for col in index_col :
899
- last = data [offset ][col ]
884
+ if len (header_list ) > 1 :
885
+ if index_col_has_names :
886
+ raise ValueError (
887
+ "named index_col can not be used together "
888
+ "with multi-index header"
889
+ )
890
+
891
+ # Forward fill and pull out names for MultiIndex column
892
+ control_row = [True ] * len (data [0 ])
893
+ for row in header_list :
894
+ row1 = ixmap [row ]
895
+ data [row1 ], control_row = fill_mi_header (data [row1 ],
896
+ control_row )
897
+
898
+ if has_index :
899
+ header_name , _ = pop_header_name (data [row1 ],
900
+ sorted (index_col_set ))
901
+ if header_name :
902
+ header_names .append (header_name )
903
+
904
+ # If there is a MultiIndex header and an index then
905
+ # there may also be a row containing just the index
906
+ # name(s)
907
+ if has_index and offset < nixs :
908
+ # We have to handle mi without names. If any
909
+ # of the entries in the data columns are not
910
+ # empty, this is a regular row.
911
+
912
+ potential_index_names = data [ixmap [offset ]]
913
+ has_index_names = all (
914
+ x == "" or x is None
915
+ for i , x in enumerate (potential_index_names )
916
+ if not control_row [i ] and i not in index_col_set
917
+ )
918
+ if has_index_names :
919
+ offset += 1
900
920
901
- for row in range (offset + 1 , len (data )):
902
- if data [row ][col ] == "" or data [row ][col ] is None :
903
- data [row ][col ] = last
921
+ # Forward fill index columns:
922
+ # TODO: also forward fill when index columns are selected by name.
923
+ if has_index and offset < nixs :
924
+ for col in index_col_set :
925
+ last = data [ixmap [offset ]][col ]
926
+ for row1 in ixmap [offset + 1 :]:
927
+ if data [row1 ][col ] == "" or data [row1 ][col ] is None :
928
+ data [row1 ][col ] = last
904
929
else :
905
- last = data [row ][col ]
930
+ last = data [row1 ][col ]
906
931
907
- # GH 12292 : error when read one empty column from excel file
908
- try :
909
932
parser = TextParser (
910
933
data ,
911
934
names = names ,
@@ -933,9 +956,8 @@ def _parse_sheet(
933
956
output [asheetname ] = parser .read (nrows = nrows )
934
957
935
958
if header_names :
936
- output [asheetname ].columns = output [asheetname ].columns .set_names (
937
- header_names
938
- )
959
+ output [asheetname ].columns = \
960
+ output [asheetname ].columns .set_names (header_names )
939
961
940
962
except EmptyDataError :
941
963
# No Data, return an empty DataFrame
0 commit comments