From 3356275658b3958b952390892b7e88c1c7ad19d1 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Jun 2022 11:42:03 +0200 Subject: [PATCH 1/3] BUG: read_excel raising uncontrolled IndexError when header references non-existing rows --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/excel/_base.py | 5 +++++ pandas/tests/io/data/excel/df_header_oob.xlsx | Bin 0 -> 5605 bytes pandas/tests/io/excel/test_readers.py | 6 ++++++ 4 files changed, 12 insertions(+) create mode 100644 pandas/tests/io/data/excel/df_header_oob.xlsx diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 76f6e864a174f..9a3ebcab4360f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -863,6 +863,7 @@ I/O - Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) +- Bug in :func:`read_excel` raising uncontrolled ``IndexError`` when ``header`` references non-existing rows (:issue:`43134`) - Bug in :func:`read_html` where elements surrounding ``
`` were joined without a space between them (:issue:`29528`) - Bug in :func:`read_csv` when data is longer than header leading to issues with callables in ``usecols`` expecting strings (:issue:`46997`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d20f347e54d6b..b733a4c3debc8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -774,6 +774,11 @@ def parse( assert isinstance(skiprows, int) row += skiprows + if row > len(data) - 1: + raise ValueError( + f"Header index {row} references non-existing rows.", + ) + data[row], control_row = fill_mi_header(data[row], control_row) if index_col is not None: diff --git a/pandas/tests/io/data/excel/df_header_oob.xlsx b/pandas/tests/io/data/excel/df_header_oob.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..1e26091cd2ace45a96e3ac00cb12d80252b304e8 GIT binary patch literal 5605 zcmaJ_1z1$;)&`{8A*30Q5(S1H5Tpd@ZjeTDXz8IrKtLs=k&=`iq(izD2Wf^zrMu$} z=iL9w_5Q~-&&2cWS!?g_UG=V|s&EUP6a^a_8)aN^SQF)j;30n-yI9)0a&caNmc)Ki zZU^CqZTUp9AMwwKV`t?xD9Sc4@X@t0IZ#(eUz)u~C^|eq zfLIiaBDC}((3)7xq4^#Cm(E^X0)yFpUAMtc)E%YRU0$ik?+?sN^yT`vy_U!Qv8SXk zjQZVt-2z2a{DsR?Ef1bci|~~Yz4a)7vz2!CxIcPhh!Vt(-$9O^#`_WPf(~OGgL{v> zVvcV!)sy~F5>%tCMA*7*R8lXneAf^ zHUdIGR&G8@ccHS1oA1bId5z^?>SvtyuaKxJU}1-*@TF`bK^Q_qK~eojn23=7@U-Xh zaCEUVb9A)h^n}<)YfQOJf{3A8I-dK4%})0~N%;JpvB?OQ+})s-GDel7}^}zT^SnOk=s%J@1A6YEk6}_dRIraPD6BrK~ z^qLGZU_uu9^djD6_5J zUzCHcg0uS7m&%7vDlkK7!%=(Rj8Dn)y)AkB{`K}a|I$w0$$(vUx>GW*|AG8j@eWT0 zeTy~?WfNi zB-ZaoO*}|@G9Z;PQxhCH!IWwB9z_21_KUW{IIB$W5@~`HM`4$TYbq*rf%S=|3JF6E zLF^CHfuR|pWXVij&luQDHY0l&4nZRh%6oUP!*)ranvWUmg(%s?eg*+YL)@cVIKSIE ziZimtwga)-$nOjt4`)9tQ;$vA^zW6691v#(<}uD@DX$FAP*G5vZv8242>y;6S2r(vOV{hD+0gs!00+@D?~%UUdSLU`om!qZI*XCh z+AKXJgxH0IwPI!b&7uQ`V6D^mUEmJk^jzyhm}3ZSjm_86;O8W^YTk0^Cr1~4#*eOS z7QMT3;)Cu#aB0)K^YPFg8-(X6t2-Tf=K=wj!gh(!4QI2e#p?#Y=)NT3N;HrBOcKSc zN~wzojfkJ%?e7al&u5!mq(qfp%STH++Pd=X7`BeO7!>@T~bc5hpc7>CHcgukFPC;Fr7gcmV!(9${e zVG!?R;`xX+99t)4k*1bwJnj`ICXiuccC$Ch2eQg8tyU;CXV({2XxiLcjsVKo@n_7R~TQQTMe+$msNxatV&n?+0tpbp=5t5^^Mw#t@9j{xSVH+?3^-n8I?iv{pw27=0 zd)$o{ZJ^vB$;Mf<8Sqk-qk5H3kQf^$Qb=l_ajgw|n5G@z{i$q%W?wY&ZXo~=UtIulZ3FtiIQGEyUYhUi*#dh zk=%T(dyKi5>8gM#QZ9@P98PE8OGthIlxXmlL>tCQf%+P1lvTgsmU=8Y70+k9z>d2o zVG0(NY)WHjmQakhEC6!yP%YyO>^9D`#wUFamKxf9DDhZ{;Ix;DvIPYRvUa2#LY7>< zwvbi0;`d55t=iQQU#j&Oiu~JEG6P+9#kb+%HXVFsRNSfI&PO#rQ@=tW2~-}%BjBmr z73j4#O5WCsWAG@a> zyH8g8D1UrWI@chIM;g$1?m$bCsh<>Js=!(W*=gT?lPyjyRP|MoptVoT#_(@kE1| zmkVTUW@VCnQ+?kd6k8PilpI;a0(rj+*XV1%x&)mFpKxFBT}-pS(DDJ>m-=iucf7J@o18Qn$sdc z8Th^_8RjPMon9ws+OJEZw=2hketBBzHx3q05?SITih#woT(}>_!6^NGISXq#B>_V$ zex&=pL{;%tArO|!&)uCbsAuDFEAZ8ZwmW9cfBL6gawD8~nDC+!uuv40&m{|21@JBP z=K#F{I!T}8homD&6+2nt6I$7L=@aHJy50$#NUrMuu`|LUZmozx`+uG`6MtcWx!KS}lA`kCKHdU_n1YUS?JNQ#(}OCWwbeKNSW;Cq%H3IK zp29=exvGgXB;a?)9>W_V!w(0-T5u4(L*sE6vm61&N^k7vqIRl)6Sqt$KFE&@9S3i% zr1=Ft4ydfhyE5s~H7B;vKxxNHD<)UZI>6OPV^4eDVlWyR6!)#!&~*~c0{nzuD+R4~ zF}uPsaHqpcIGq&e_5SGq_j_T*S%m{c3yOczF(Po!C4muFF4DVc=sfl0e543|5^&fk zVB=hT3i=@A|CJ?riZ|3`+{Z{u6+FJ|<0BZQf;|{;OP*{34fe>yu9TF^|HnvDT)x`@ zx|qqcNOPB*m)glQghheiF3&Ur$&B-{gcCA~?-eh38-jQn!p(b)P2vqr;~Qh#0!N5N zClVI#t?X=PiF3cWJvbgB$)g*s6=;VFyGBxs955;()NWo$|p9ki65d_=n!Ha0-DaCbOn9%K0UikXT& zqB--8VUFqM>ta45_vmen9IUYUZ;Q?p0k;kJQ;A2u=6Kb_h<}NF`5G zf=;VB;l-Doomm?COAS0e;Kuwy5Di!(d??GbJ=>KuYfY0$)EFCbXOAp`0JVDrhSelK z+uQc6M|p0T2dzdWl#)De@e4Jknj4%NCf1zjTFS{43K`SX**?|O{m_Ftbcp?R(gxp! zG571|tD1U4YNlCt-A$f|X&98lSEZdZ)HHsF*5?7pKypOYKmuqDuU0;f#A7B7^l(s8 zJ9X=>G{#Vaz*i(*P1?LxFBnWqtiMR8?>03lW;RNzwBs9FxoTzv0%+ypj&qoo$v#ED zg#hlb7#qp#(nPk$J3B3nBqiz{+W5rpx(!qf972J4Llj};!zokFW0Z2F$nlQRBIvEL}!{Y9}a$YhQ><*>}7z)evSWf#Y&U1|$o zIs{@ztFQbxsSR)~*T2Ml8QW`?Jy_+7UJdV%!pAJiMi2ee>5knLc(J)LPpLhajhM4$ zepOBnC~hPf54a=8rC+rN*5fp5d;RcfZm+y@;aG2d@P-;!|E?T74Rw%Aqr~7A%;COj zgKOJWx%n6Yp>dASJ5Ub+&Y%n38{T2{S7vkfxp=VlAt%eC%VLbQ<9GYYg5RkNc;I|9 zenz+Djpv9nRif&*e$KSFTazqZQihoEMY~2uW&6bYEArgTf27f*|E5FMrY@Ei8g4E& z4px7$BK{a9+%^z#=urma03xrR_?}HYu}B~tFxQ;gnuVK}!`mlkZgU|lA)mrQ)^5A&%EH@86X6IiU~gMx%M*Uo-pgZ57;^x2E+zRY6;U6l;FRxsV1 z7M!R4<3!iTOJRFck&b-$PUQQar-a+cs}_#tsxFRBu3YAhE|%A=2&&$X9I1#K_sHIC zMeMP?*HvOiU5${()(@8c+};I%qaC_7_>t&FIy<3ojnvvVwmWE3U}Dg}Ee!vO5WGK*~C z+cl#BMf($2qORE=YJW1NB@iwZAnYFDPJJYp%uSrUyn9|*@4x!wpw6k-ETHnF;&vOU z>a98}f}X7DRuR@3k2U7L1rXtd)541#;gr`)u^nS&uIFn_ydoXdebbVbHC*Yo=;=VL zLDIx71kYM8eZZgsdt?{qBaP^*+MmpKmz4F@A=-z>281U{R-!{DQdZw?pZUYaiSqs2 zs$&>q2YM{rc_vN1=xA^E&d?9U>KQ*Dcsf<@<%9K%kT*Ianm~}5W{%-}kGqQN+F=%> zCJHV{dcO3>cqG2*FjFU|Yk#G`R#8FbXXlo0B>QBwQlUJhMhZ>{SSd(KqCtznxhC^m zV?lYUhBW}TF);Y~^XY`2JrtG)7FV2X$e7mVzt<{P2Aeuqf_mOd%#~$J19Uz7(TfAn z+oL`thX#aGslUPDJg|60p&idshjm{)0g{;B`{X{Tzc$A&1r}n(Wras*qcKJsdN)WT zv(E`LR%~H4;3|}z%YRdWzWtsBoz<;z6Ld7IeRV>*P)Znv-_h$ct4=y8kUs9fo9*b! zsO{5+V0SHp8J$71oEcx5ntq4%$pu@J0Wsge$CINk^Yy5;1fq=EDVjRE{;p}keZE2K zNF02SX#I0&liWng{JE=}Bjg&aYPgbP5QqY6`|J_MPg(<2O?(qlzK84O9f>|Vk$k}) zZHlS&4rg@2ehgc{r{Ia_{u^-uDgL7guG#XupUme7h@IG28LiHVgJ;|~lml}JR)k8? z^SP}HF!z@3d230kTT*7av~Woy5#N}I`sD`U;lQ;|0< zPM^ii^aS#Fqfde6$L=%My|Es{*N1N2Es)$ng=|;PMbc_hFZ||8;9w;RMrjMK#+Vf1 zE5VET4rz^$^Y`yc+dEgO!Id-Xw_YqxB77N^o7du^MUwTH#b9a?f^?Y-lD8X`rOWSJ z*%0|m897Pys&T{Jz|_pNHZL2Tb25BBMr)8QZV7X6i+yR7nhhx!T6gpk1pk;%9G&hh z=_$`5lIa$$XQ<6CjP?eB>jIdyH&UB6KCKBV_#qL;!WM!RQcEDwFaF~wdF=rMDMtr4 zO9wY&O)n=)*Qb9mUClw2b|7iXR+)8eO54gO3b+H7SWJC~mjHGdN2xFR`1GYu);G)K zYFE9*h*qB#E+b$n#3KE}C*dru7#7bcfU2TvRv%5%1G=wve5o;&oudH?l4UMJMU8@j zAzET_@4>d*dM|aRJd%K-bgrQWH%tAkadQ{By(YEzEib zB?gm>w&w<)7?L&C=nL@)miSp1Jm*B7%BQCIS~vK@y(7-bnxvHOgqpPS8}y=l8q&?2 z)?Z*)Po|Kd!2ohY~#s zgl;~fM-{7?tlj3-1~ajM?5N44 z7<-@MTsvvuVGWT5(n%v2*J*-^MvC%lp>}hJbiGjfZ@IBv`(5$o65)Eq^h;QgxB6}2 z^t Date: Fri, 17 Jun 2022 11:44:20 +0200 Subject: [PATCH 2/3] Fix issue number --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9a3ebcab4360f..fa95ad06bd7ca 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -863,7 +863,7 @@ I/O - Bug in :func:`read_csv` not respecting a specified converter to index columns in all cases (:issue:`40589`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) -- Bug in :func:`read_excel` raising uncontrolled ``IndexError`` when ``header`` references non-existing rows (:issue:`43134`) +- Bug in :func:`read_excel` raising uncontrolled ``IndexError`` when ``header`` references non-existing rows (:issue:`43143`) - Bug in :func:`read_html` where elements surrounding ``
`` were joined without a space between them (:issue:`29528`) - Bug in :func:`read_csv` when data is longer than header leading to issues with callables in ``usecols`` expecting strings (:issue:`46997`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) From 8fab1c09da7a49603c0d7b2b3c636878aac6e7bc Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 20 Jun 2022 23:43:21 +0200 Subject: [PATCH 3/3] Change message --- pandas/io/excel/_base.py | 3 ++- pandas/tests/io/excel/test_readers.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b733a4c3debc8..24b881bda4805 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -776,7 +776,8 @@ def parse( if row > len(data) - 1: raise ValueError( - f"Header index {row} references non-existing rows.", + f"header index {row} exceeds maximum index " + f"{len(data) - 1} of data.", ) data[row], control_row = fill_mi_header(data[row], control_row) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fe165f4ad3242..4ca34bec0a7d9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1559,7 +1559,7 @@ def test_excel_read_binary_via_read_excel(self, read_ext, engine): def test_read_excel_header_index_out_of_range(self, engine): # GH#43143 with open("df_header_oob.xlsx", "rb") as f: - with pytest.raises(ValueError, match="non-existing"): + with pytest.raises(ValueError, match="exceeds maximum"): pd.read_excel(f, header=[0, 1]) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])