From 35ff5cb133717fa773d719661ad6c27cd7b5b257 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Mon, 16 Jun 2025 22:59:33 +0200 Subject: [PATCH] Use ActualText when getting the text for the text layer --- src/core/evaluator.js | 25 +++++++++++++++++++++++++ test/pdfs/.gitignore | 1 + test/pdfs/issue20007.pdf | Bin 0 -> 11544 bytes test/unit/api_spec.js | 14 ++++++++++++++ 4 files changed, 40 insertions(+) create mode 100644 test/pdfs/issue20007.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index edf02c2c052bf..5cff6bdac149a 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2406,6 +2406,7 @@ class PartialEvaluator { transform: null, fontName: null, hasEOL: false, + span: "", }; // Use a circular buffer (length === 2) to save the last chars in the @@ -3070,6 +3071,19 @@ class PartialEvaluator { textContentItem.str.length = 0; } + function replaceTextContentBySpan() { + const { span, str } = textContentItem; + if (!span) { + return; + } + textContentItem.span = ""; + if (/^\s+$/.test(span)) { + return; + } + str.length = 0; + str.push(span); + } + function enqueueChunk(batch = false) { const length = textContent.items.length; if (length === 0) { @@ -3446,6 +3460,11 @@ class PartialEvaluator { return; case OPS.beginMarkedContent: flushTextContentItem(); + if (args[0]?.name === "Span") { + textContentItem.span = stringToPDFString( + args[1]?.get("ActualText") || "" + ); + } if (includeMarkedContent) { markedContentData.level++; @@ -3457,6 +3476,11 @@ class PartialEvaluator { break; case OPS.beginMarkedContentProps: flushTextContentItem(); + if (args[0]?.name === "Span") { + textContentItem.span = stringToPDFString( + args[1]?.get("ActualText") || "" + ); + } if (includeMarkedContent) { markedContentData.level++; @@ -3474,6 +3498,7 @@ class PartialEvaluator { } break; case OPS.endMarkedContent: + replaceTextContentBySpan(); flushTextContentItem(); if (includeMarkedContent) { if (markedContentData.level === 0) { diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index e1a6e57ad44e8..75ccda1bd9b39 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -726,3 +726,4 @@ !chrome-text-selection-markedContent.pdf !bug1963407.pdf !issue19517.pdf +!issue20007.pdf diff --git a/test/pdfs/issue20007.pdf b/test/pdfs/issue20007.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dddf67441a9bb5ca8ecb40bd888c11c50f9d2c5f GIT binary patch literal 11544 zcmaia1zc3y7p^o6-AE1%5;FrscM2#S(o#di&?%sRh_s{Jz)&#K$`K?f2~qGs zSb8Bnz+4LU9!M7lZx?P5M9Twd<86%qgt|@+mJnbr*aZfK0V6-O9gwbymR<-jm!h}; zR6qzS3=@FDfE!^RD3lZU0rb?7Hvd~iS{j6KwL$qH_>V8vmS7kNBCigHXdpdYES>*R z68c9;*}>Tha1EmD4EUgkutwS-K#-dVS356zu&|J*v^3b$3y8@DOc@Wu5DlpyHv z*0R4WVNl6LieV*2UE0`Umi^nNeeuDJ3HQu*B zXjpahrtRO)CKXqL-gA&`D3V^?9*7CZ%8N0`d3@)?NT9Fzf`t^>%<9dASw0SKG*TIu zYENKfpChNOQUr;*n}znr2dv&(1L8{z?Fu-Zpc`tcdtxV|h^b-;d9mT~0rS*hS~JTzWH5rBN$uP%I7>P28y@;d}?3a*w}l%^)+6w@$$$;vcP7{?i- zXZfnmZ6rd8ud87sqN`T&8T#ybj}D9(Bh2 zeJ;(#j2=}bx1GGAkMP6YCS?<3*iEllPZD1eQ=Q48afY ze{~VgeqGf3Zl(P=P z6Y1??jqn5m2nEn+t&8vioBRMGL=WNT1%{})0E+Uzz7>9btD1ozN`79~b-hsV1g2gW z1f$@k0{{u4jCA!v$&}$>_)i&1s*bR6u#`voflZ)*P*@PkFD4`=CJYu36#$GW0CNzo zUY=kS{&YYPT`v!BYcDNJj~_xP&?$c0f`xxA@+&IT4ES^dYyw2}Q(s*H5NO$fe^>^B zQ&3d|L#3sGNPrtISD;lue||)`e_Ix1=wI&sXIVuDAJl?kf3Nq$QJp`n3jbymrTxE* z{wo&Ie@y>~>QB=`znMm9|1Z;uNNaBwAVWY~v~;xtb0J*$^mS2f9W3(qIzNK>a~Xl( zmVy0MoF;$8`zyV$-&*!x`9)QPo(BS`T@7y+Ah%HPkLmwPFRJW*`F+#D#Q`WQlizb9 z!2O>T0q$v7x&R>fuiB68`j@g0_s{hFODN3!t4RJO6#=BELi?9%zrA06)+X!+Mt&k>J5ZHs^b3=2D2ey7I$3M`c zVCiM)jI=`;u>*GF2iSn(w)AjPb+tvJF#W6ge@T9hA#A_^S`jFa|E%Y~T>V)Ezb*J% zXg}YhFfa@b6@~*nQJC*lc9ya;4-4%j;gdDFJ8%^w(X&k#=PmygOEV-Tsd zJL$FmUe&|oBH&;*QIX_zc73ky>boq^c~IaLC2gGyn-4uU%BR~Kr$N&fK8J_FYxi(j z!*A|M=Gf1_)y>U{U7$mRZ*8`0bbU^F@AS>(1QD)6{aK_&I-ouNV{HibKr{Mk=Z#Uu zrk(aq>5JQ8Ld_uyJ1KM`5~o#yZiVp)d&um%H`T|b`EM2$hWFZR5C_u_W=z8$Hu zpB;`7l=Tc_k1CNw2Dj)Le*o1pmv#JMqS&{?17T<8o1QKEd+%%Il-G_1`j8>_gC1-) z;?Lp?6KMpM9lzWR+Q<|-l|kIUV3yfs>YKl`f%=-(9WL4;lDill5b3eo z!dICn#2{?;J*0hUq18x8OLpay%IM26zwxZ#30pAc(eeD@&dxx^o#~*yW%X~l85kGK z8Fg4iN2cLNAhE%hw5qXdYCZRn^zR8=6kiic5R`9R&sb)86Z2GdbjaI_{9BxlFQjGT z`jGxukdK|+QiE@N1EM~#Bs?q0=?hJW?KvH8AGo~K>5EEc2m=Do(QP@FOc=pJJR_j? zkd${Y#4Pyf+j?~h!-Bh0OC(CsUntL3$ek=NKRSKcYn$_lef5@ECLX#P60mw96jw=v zGue~+aO%jraraREU`O~K&wc#pOcv6wef4%At&-f^WB2j=K5A~ZQK;u$_p$WBiMtZe zz23$mi_3S{R2sB1p(rwhD~>mmyL>g_D~HMonxWZs&D?jgwwj>?@3iS+p9d8@u(j~U z=wfdP#9k~0kH5RJguO=GdM>(%NvmA!b|E>reoZ{8?5?yh8sp?CtcuBP^N#p!-4%d5x0hC-W)9^wSF%hue^CIneNtz_GRTGTMpyiMs7HqYP&^QylTT>CkPmnA*L6Co3Dw{jJn3l=wIl4)oZ?_zBtQl?k^3rc+I=uTVt!a_4%8TwowJIhJ{A#^41V873@eUS)!X*LA)>Htw zRCu<+G49)#nvJ&*(jm^RtMK!*mrL#QfBkF131%>hUS#(oLOu`1ymTrEKi-B6g>AN- z3kPhQt$ti)Yz%!GHM6=baOY|COp5QkQlQ|nB4%Utt^m={lV;%)nFz*!(UT_<9`7CY z{NPCW2j#W3;RCX+NAooa}(pk^OPac3bLqtMpOKTaX0&*`NWXbJ!6%k zjN9?F#icWMYV<(E0g+Upoukj1jK&@_;kondiSWmUzgW1v)h=>N!60y}nz@9Z*M%=j zvcUkPHg4h+T=(XU0qAtqNG5Y}Xz%1rvy|*aBMDLK=Z#+CcSpytxZ}m|>BNeY6MG8Y z-YytEcY-w*wBi+-V%M;}tj9@2Mn}U)O3TblZ!_UJ^gTE&|IR_e##>T$UmnU7)he0L z%s3bOD4`ntSzmKGUuO2<@srl}V_y~F)EfB-uGWjtkiKTD!zo7cqL$AOt7AW{E%xkJ zTQCe9XqL7X^4d8b_OL!50=>My=rpvhltXHHwI2sOYY{BSwgkEFhyRob6#Gca@iwp6~ilWB4;m6S~+eh zvT-Lbd>}aIC8@uTP$_O*oSPw?z%gYsn7Ni5^q^V-^1*~u;g!QRGh-e^*oS~zI2moJ zi+pMuenvFw^Ko>=Y2y9+)@L49LXY={W{Q)_1uTTfjNRxY$Q}}tjfE~nEVf02IJFb2 zlT4`C*BU&VwWX#S<$3N7s}m#ZGUnm@Y@k6X{F;5tH|N~XiOvW=f3$|riMbpO=TPi^?JE_#AcS+AFV_WR&1l3WdgG=Qiw5`-??7;6?8p)7T50s7tMC% zC0aVJa6SfO*TpLu3pLi+F_(P{2)&ClzGy`iwq>NCXmGyGRN$MBJDjWmoL7h;O^3`g zmYb(;Z{r(p$F^y1(uRBEM$ZgO#gOw$hS283;1Ho-vLtI6*_$_hBDxcDKsD(! zEs=kIFvG|k7Kon8gR>KkgT6)m>8No0-vOKJPCJ5iR;RANOLYRdn zl`uahv)NeVmHeJITdEmC#;}dIw&W_IE%fx*4A@)Y?Jd+dhjnLyH+K^(NY-gYMZ3+f zgpo~o-l}6=Dxf7rn=TIHIvjW{yKq`haA%Cvu|WXWRt_#W?86nddvq~t@kA!Z$w`>( zs-4BHoaKw^8X5fhN4kv-E1t#&iu?wa-g^UU5?UjtMq6dJ=&oHjKHP?5y6>?y2Yp}H z<@eSb%+%J85GhzZHA5n6EAWZ@whj)yXMH*MwQhzCJU0;idS^OgptddOy~&n^E3fQ& zpt~&JdO2YMel4G~=9|enbm+dYXwQsKGzlCWaY=d8{rFYiiRiK9+kR{k`Hdl8hoH1B zcK#Tt3u4cU8?9wme2(kh?+BD<(ui`}_--lWGx)Sp$KPqW(2lGh^KMhU8(-bV{n(&x zBvrQT!cqkh=pjBew@6cV#ZPlpC{pmsB>t7fmY~v6x2o45*(%S_d0sTP#Cf+Fig&Xu z=kyOS^2G+2uQ1Mt(u=mciPg%?m!091%QGuy97wz-SC#m&1}gaKF5 zd_RWUQ{W~VP(ugQIt}VBYaTG^q#fGXj`wj~bIq3k(F*Qdfe4@_{V~AI(&m+4Ix=FX zGr_jzMvlLF@Y2~!K1L7-+Yl7+; zE03~dz(oF}M)nkl(?U$_Ec*Ft!L)ZprOk`;5*58NWWUpAegEY2h$T^WLXP*LTx_4@ zic*fIkL(7=eg~2=0M7p%ZlZsE(kSg*!}uZoi(t3Q72}+MWyOi)*U>(cAK8UGCq=xs znH|C(SF&rI6fN%w3#4^Sb{OV}x=2ZgG>R3y{_yb?<_n8=%2NT0)LncU`&ewV)WH8P z<)q+~udryeHH11TW}R@!qk57eNud1Shvvr<)2ytDj8?pzI8>=r@^SJoB0iuQIAUxM zT9(t6L-@T0Ebi@mWh}$2pU{e^ToKLFk^dAaWx6$J6D43mC!%}024DS@7gopI-QG{- z#bI(YxFIpNFy(IVw`k0f(z&vv*Vp!O^0M72cpFx990wl0Wb?IGZKJs6mUCD0Za0%0 z>(RS!g7z}$YrbkT1PYJKNn>!arf@k(kuxv?ww7)xD0Tk3{D71lEiwMc9@|fC*^a_Pj}aCB8BqQlHU<+6PA26o+s15kU*FlhO`8Je`& z9eSU%Asu9;@k6{xm%C$_bxIyHD^kxUUyzNOpoNio+(PsAu}pR%yM_2b873T#Cy*dZ zJ&0E$;%Tly)VWJY`&wVgiC6d`BEZ;5*;elr&gX6|gQV7CgiflS$)Hvz8A5^KH9z@I zk=96r>qSk;w>`fo)qbe;6Xgi1#>5%BkK9HdR^2C;)m&&fL#;mSzbLJE!$+meVBKA> zee%fB6R%|B43`NX6w_pHI)1yS)S=MGcQy+JhAHH|&l;xg{~O!iyX zmSEj249kxnv}hb=7abJw+1DsX@^%U7=H)=yaD9^0j>F4_Wwo}0s^>xm@M?@dC)XLh z|7Lx&qDts`LgB<&A$L;G`AXxgb0OcxZDMBp5F(ieUV2O0#o@!G9UsfBGu$s70sDxSHZ6Mm=sq_{--ozr+WOIXT~SDIt;80R3Hp0<#! zx8=*6mK3)FGIDp*cVyde^`Y`Pnx4)u9ruDm&XhIeKrA(;S*=RHJ)&9vsNh#GgVlns4+Yjdbv%U;qeVakf59X34Zv@uETdyBwN18I(=O@(Sl~3@D z`sNx#bksOKLLe(tFgM2?T-UkqBdjk5g^!;Hje6^8Gg2$9SEX@G)fM2IWxu4bkwns$dvZ7PJ|X(Ty$Q$Eb9w33 zyPu+#skRjDibk<~d@)2t6trwzD$Rx6waVli8Egt$xuFwgJu~s4=`V&m8*^UrYOAQm zJ-RVGS7|~~^36)p$O7AduZLGFa)_4HYHCHGqsM0FU4F56y`+_6jqAfFN7CavYW#d% z&JC$F8DJeIv69MO;oDV{F%P;3d{V2C6ZqINNWG;5@64TcV-#(BXUxHTUDR{b z9~*e7P4~(^-#2FOw*8=rt+7;x1LK^WFGHfep7Bg@|N4wz*0s_Ee})GNYatnLy)q$L z$SyIp8IR})N;Dt(dR|RXU@Fa3_r?{!^Ty+cjy|7wPCmV*O7rIBspQ5P>rNGDBT3m6 z({)NFGs4}mE;6SW^^U;ORxiohPY)?K&>Sx+({ey-+@V#QV@k-D=MWF4Jr^G|BKgQ% zV1}b^<&`Cu8rWY0{T^fVU`?UBBd;YIRDM%3l5Kcdw(1Mtyer)+o5Xul2l5wXRQ@>| zP5P?uv-7-onH~qx^*z9n>ryNi>~J@r6Q(wgr_ITx>f#&JyWU%PjwfT{g-;#r z|0pXmExVA?qe50DJ*9X~F+$DV^VqOa{J1zk@=1SX%#7)j9pw`NrrJqQ%(ToN>rY-2 zNn{av$%Hf8lLMDsOzj~|e$(+p2X@{TV{I(GRxI*UUGjUm-101Pb69M+1E8_WzSr~; z(KMdp+s_@fygnjS`-SsORi-f>89sBy-8ILl!aHbknjp6#Cyt>FhYgSOc(-C-Bo$H` zTXdI}Sa$ocQqu8#>y6>&p6lc2ieh<9T3a1I!fm7lUG2juQND4p?s%GU&I=a^DWp50X$#cZR%{tYN!pHm` zY8j2#BuSAJ=-kXzvyB1eq|`#j3`|e<^a}NFQewVKhw1nAk!){`$yi+N{B&PAn6Owa zcMz&rXa3n~tKfjGa(nZUrW}&xtu)(wg)J7D`-^+RTT0JJ(Kx?N%;3PduTa!1O z^M*b1cz{W@-AqB@{Oz@dWFFZQ678hN-VQbL)wFM!$ZCy31%{J#JT$ls1>bNq5Npx} zfX|J};j@n1x;8NUqBJi}ddRTn4RX=Kt1%{{)KBGMGh1K#E|-y!JRt#xfV7D41SpO7Y(6b6O+D|b1HgIN2|QM1l^XiE|`GC z%Y_%nt|{1%aEP|;%MFjxXpA==v}$GTS`CIh@C_7#Rf55fI5%7=@OU?9beo7oagT5= z-e`bb-#tm>PVIEadO_@!{FqVZSy6vVCB2EDX9Nhbfjh8PyEdN7JBLznk#s;LwmB6u-CPbx~fU;L0!gkf3n5dCWA|V^(Y>=S>DYLdyP6n zvCMdCp0ST=Hcomt*iU#2)mJ*SIMN+Nf-T@*>7Ej5*un9f8E8q`BJI;OdO+3s3MLpK zIr_19H;uCcpm_z9oM}8XjBXzOOYe@TpI>vx(QKTZ^-ZR^-Wc*?2|MZ`%Xk?r`R1Nk zEv);g@bqRSaUJ@|iIYOL4PVzY_2|PK< zGZI}1*tgWmOIR@W+}){Lb(eO6la~zUa_-k8%|8=crS$l5V^yqA zuTYKM#z=o?__hL)hm+nGnR2rqQMn$o%4ODw&}KKwdfQBprR}-+^l3iK#il^aC!VT3 zU03YHkpg39=DNu6@>CLC`*4zfkuPg8jbsqfXigmEL?k20z3HA~a5ChZ^-4JI!>4L` z$pzUP(fzhVYFbv7#vCbC%O^S42pW7;1@X)wwtJ+BLVR0x*4f|lJ`yh{a&$a-h6|g+ z-TxFN{w2&x1%hFT#af+?P12K`nJ(p3 zWlOFm>B*!*SCnGa=a7^;iYdd(a7W@a?0Rk5`%5PL&wc3An& zG{MJ@ebIuc;AJ}xC8Sdze1B2#btF(hvS;DY08@J@yKpA@0Z77?oX_En!v~TT@x&=0sf-8dbo&~ zZmoqS^0{!cq%Vgn+*~4iC7im1^Dp#nEhOp%`F!?4yMe&rTLzWy^$<_!9)r+|znhx~sd@#B@HK9!BJg-12l+>Ywq8 zTm0a^&N7?#n9ajTO|{Q;O-qh0k>WZ=Qa8t3o`#}n&^H47vYS=V;j5HH(GLtHHVzq< z5uJ6Z#-9)9AYPf2#5{$Iv91nU(c=9K6qtxJj$p=4W^;o5V)e*Dl1y9*zvQ7Ndzha- znx)Lji#j{~zT8O>P$mY~;`gr;$r7_lsT8Vrv>qKu7G}-}9B#!tsj_zNolmqYQT$ zlis216T6ZUc1`f&(r5r2febb47-I1_IWX5hF>G2CKjRa{rshV64a_DQW+O8eA@HCJ4GXY@8zhqRsFwy?Iesz*ySkF-nmO1(9excfYL zk-sB4t`)yh{1t)|+3fk4RVz=mVe`>le1R3rMDHq2Cn0vro~)BPYx&MBO@~TMtjJZP zt}UG=KD-`d)-S0~Sh!Egd+N2drR({l0ubAewKrakF(z4EhgA3bYuhSY1+3i_3NkhS zp4uDC|Jr;2pF8}5m0v2;^G zIM~^HfrVkfh$?V2?qDtFYUhjq1COZ1<$usw_+Y@nI-h_TiijWxLm36=1gK%)OdBBp zhW(&I{B{Wb2VVoF`*X%W(SU!13!L4{TY4f;3jfcbbiJ*-eh_*93JOXQbwV!ycn=)< zBd@C}s$04NoE;k=LI*GZA7Wj9Paw#@4)lS|aq#r=@CS3r0VFvDK;F>wut9h@py(Wb zO#_6w-fnIHIR>>+0L=vixrHKLmc%Zi2z&x_!a@1z=cH3z#_m1To4sD6f7tP z5Q79z!~-Z8E{vi&05V}AFiaS=4Zyt+FfM{htst;(U@-x(i4d@3f^fhvTu=~Y2sIgH zLQn{G2NyyafrG00|v=x>%62`*t1hNMzyeC=?GIz%mqVxy^Y+9#B{XVei-T*ViFAezJ z3DRa%`>N3&BGOc1VWnQ2a<8J925${>$ska!TaPE7r@ndZd+V-?hfqAZ*Gty`NFU9v zjI&o}f4_mmr|g-Y-RT(gkKYANf*$sIcwdcRT9(y}3G^SJfE%p5WHetTdc67J^UKO@ zpY&1TrQkVF#=H09FxHe0KwZyMEgXZeA@e>9yb=>BfzLiQ`#rXpD10FmpFy z3E$-@L+7JhF85?i6`U8PxuV|55%^fAEu>2-bn>Ln!-rB6{McIY9)UvjNM3}3DLbAH z-P77u!79}12v^DOj3XK5ja=L=b6}bm-(TJ&bVX9ljPBPg9!qg^KU(vs1N|i|H$B?|6KzL6BPS%t|0W!xiC>+L;hn=Nc2xV zVH9EMKYGH#f9i<}{HZ4<{D({cDgrQ={%uc86xEOaC4&JQ^QR1s>J5MCd3jg@z1#!E yV%BvC0I~`2tWaDmppBu@3cMUp^r#=gdj~ literal 0 HcmV?d00001 diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 6610de8a7ca3c..6fe47f3a06c93 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3923,6 +3923,20 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) expect(items[1].fontName).not.toEqual(items[0].fontName); }); + it("gets text content from /ActualText", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue20007.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + expect(text).toEqual("The quick brown fox jumps over the lazy dog"); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();