Skip to content

Commit 12cadd1

Browse files
committed
Add fast mb_strcut implementation for UTF-8
The old implementation decodes the entire string to pick out the part which should be returned by mb_strcut. This creates significant performance overhead. The new specialized implementation of mb_strcut for UTF-8 usually only examines a few bytes around the starting and ending cut points, meaning it generally runs in constant time.

For UTF-8 strings just a few bytes long, the new implementation is around 10% faster (according to microbenchmarks which I ran locally). For strings around 10,000 bytes in length, it is 50-300x faster. (Yes, that is 300x and not 300%.)

At the same time, I also added many more unit tests for mb_strcut. This will help to avoid unintended behavior changes as the function undergoes further performance work.

The new implementation behaves identically to the old one on VALID UTF-8 strings; a fuzzer was used to help ensure this is the case. On invalid UTF-8 strings, there is a difference: the old implementation would convert invalid UTF-8 byte sequences to error markers ('?'), but the new implementation just cuts a subsequence of bytes out of the source string without performing any conversion on it.
1 parent 2546bbd commit 12cadd1

23 files changed

+417
-31
lines changed

UPGRADING

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,13 @@ PHP 8.4 UPGRADE NOTES
5555
5. Changed Functions
5656
========================================
5757

58+
- MBString:
59+
. After performance optimization, mb_strcut returns slightly different
60+
output for invalid UTF-8 strings. Rather than converting invalid UTF-8
61+
byte sequences to an error marker (usually the string '?'), any
62+
invalid sequences embedded in the requested range of bytes are now
63+
returned unchanged as part of the result.
64+
5865
- Standard:
5966
. The internal implementation for rounding to integers has been rewritten
6067
to be easier to verify for correctness and to be easier to maintain.

ext/mbstring/libmbfl/filters/mbfilter_7bit.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ const mbfl_encoding mbfl_encoding_7bit = {
6565
&vtbl_wchar_7bit,
6666
mb_7bit_to_wchar,
6767
mb_wchar_to_7bit,
68+
NULL,
6869
NULL
6970
};
7071

ext/mbstring/libmbfl/filters/mbfilter_base64.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ const mbfl_encoding mbfl_encoding_base64 = {
4545
NULL,
4646
mb_base64_to_wchar,
4747
mb_wchar_to_base64,
48+
NULL,
4849
NULL
4950
};
5051

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = {
43924392
&vtbl_wchar_jis,
43934393
mb_iso2022jp_to_wchar,
43944394
mb_wchar_to_jis,
4395-
mb_check_jis
4395+
mb_check_jis,
4396+
NULL
43964397
};
43974398

43984399
static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
@@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
44264427
&vtbl_wchar_2022jp,
44274428
mb_iso2022jp_to_wchar,
44284429
mb_wchar_to_iso2022jp,
4429-
mb_check_iso2022jp
4430+
mb_check_iso2022jp,
4431+
NULL
44304432
};
44314433

44324434
static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
@@ -4462,6 +4464,7 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
44624464
&vtbl_wchar_2022jp_kddi,
44634465
mb_iso2022jp_kddi_to_wchar,
44644466
mb_wchar_to_iso2022jp_kddi,
4467+
NULL,
44654468
NULL
44664469
};
44674470

@@ -4496,6 +4499,7 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
44964499
&vtbl_wchar_2022jp_2004,
44974500
mb_iso2022jp2004_to_wchar,
44984501
mb_wchar_to_iso2022jp2004,
4502+
NULL,
44994503
NULL
45004504
};
45014505

@@ -4581,6 +4585,7 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
45814585
&vtbl_wchar_cp50220,
45824586
mb_cp5022x_to_wchar,
45834587
mb_wchar_to_cp50220,
4588+
NULL,
45844589
NULL
45854590
};
45864591

@@ -4595,6 +4600,7 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
45954600
&vtbl_wchar_cp50221,
45964601
mb_cp5022x_to_wchar,
45974602
mb_wchar_to_cp50221,
4603+
NULL,
45984604
NULL
45994605
};
46004606

@@ -4609,6 +4615,7 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
46094615
&vtbl_wchar_cp50222,
46104616
mb_cp5022x_to_wchar,
46114617
mb_wchar_to_cp50222,
4618+
NULL,
46124619
NULL
46134620
};
46144621

@@ -4645,6 +4652,7 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
46454652
&vtbl_wchar_2022jpms,
46464653
mb_iso2022jpms_to_wchar,
46474654
mb_wchar_to_iso2022jpms,
4655+
NULL,
46484656
NULL
46494657
};
46504658

@@ -4687,6 +4695,7 @@ const mbfl_encoding mbfl_encoding_2022kr = {
46874695
&vtbl_wchar_2022kr,
46884696
mb_iso2022kr_to_wchar,
46894697
mb_wchar_to_iso2022kr,
4698+
NULL,
46904699
NULL
46914700
};
46924701

@@ -7832,6 +7841,7 @@ const mbfl_encoding mbfl_encoding_sjis = {
78327841
&vtbl_wchar_sjis,
78337842
mb_sjis_to_wchar,
78347843
mb_wchar_to_sjis,
7844+
NULL,
78357845
NULL
78367846
};
78377847

@@ -7868,6 +7878,7 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
78687878
&vtbl_wchar_sjis_mac,
78697879
mb_sjismac_to_wchar,
78707880
mb_wchar_to_sjismac,
7881+
NULL,
78717882
NULL
78727883
};
78737884

@@ -7906,6 +7917,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
79067917
&vtbl_wchar_sjis_docomo,
79077918
mb_sjis_docomo_to_wchar,
79087919
mb_wchar_to_sjis_docomo,
7920+
NULL,
79097921
NULL
79107922
};
79117923

@@ -7940,6 +7952,7 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
79407952
&vtbl_wchar_sjis_kddi,
79417953
mb_sjis_kddi_to_wchar,
79427954
mb_wchar_to_sjis_kddi,
7955+
NULL,
79437956
NULL
79447957
};
79457958

@@ -7974,6 +7987,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
79747987
&vtbl_wchar_sjis_sb,
79757988
mb_sjis_sb_to_wchar,
79767989
mb_wchar_to_sjis_sb,
7990+
NULL,
79777991
NULL
79787992
};
79797993

@@ -8017,6 +8031,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
80178031
&vtbl_wchar_sjis2004,
80188032
mb_sjis2004_to_wchar,
80198033
mb_wchar_to_sjis2004,
8034+
NULL,
80208035
NULL
80218036
};
80228037

@@ -8103,6 +8118,7 @@ const mbfl_encoding mbfl_encoding_cp932 = {
81038118
&vtbl_wchar_cp932,
81048119
mb_cp932_to_wchar,
81058120
mb_wchar_to_cp932,
8121+
NULL,
81068122
NULL
81078123
};
81088124

@@ -8137,6 +8153,7 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
81378153
&vtbl_wchar_sjiswin,
81388154
mb_cp932_to_wchar,
81398155
mb_wchar_to_sjiswin,
8156+
NULL,
81408157
NULL
81418158
};
81428159

@@ -10346,6 +10363,7 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
1034610363
&vtbl_wchar_eucjp,
1034710364
mb_eucjp_to_wchar,
1034810365
mb_wchar_to_eucjp,
10366+
NULL,
1034910367
NULL
1035010368
};
1035110369

@@ -10382,6 +10400,7 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
1038210400
&vtbl_wchar_eucjp2004,
1038310401
mb_eucjp2004_to_wchar,
1038410402
mb_wchar_to_eucjp2004,
10403+
NULL,
1038510404
NULL
1038610405
};
1038710406

@@ -10418,6 +10437,7 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
1041810437
&vtbl_wchar_eucjpwin,
1041910438
mb_eucjpwin_to_wchar,
1042010439
mb_wchar_to_eucjpwin,
10440+
NULL,
1042110441
NULL
1042210442
};
1042310443

@@ -10454,6 +10474,7 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
1045410474
&vtbl_wchar_cp51932,
1045510475
mb_cp51932_to_wchar,
1045610476
mb_wchar_to_cp51932,
10477+
NULL,
1045710478
NULL
1045810479
};
1045910480

@@ -10509,6 +10530,7 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
1050910530
&vtbl_wchar_euccn,
1051010531
mb_euccn_to_wchar,
1051110532
mb_wchar_to_euccn,
10533+
NULL,
1051210534
NULL
1051310535
};
1051410536

@@ -10545,6 +10567,7 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
1054510567
&vtbl_wchar_euctw,
1054610568
mb_euctw_to_wchar,
1054710569
mb_wchar_to_euctw,
10570+
NULL,
1054810571
NULL
1054910572
};
1055010573

@@ -10581,6 +10604,7 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
1058110604
&vtbl_wchar_euckr,
1058210605
mb_euckr_to_wchar,
1058310606
mb_wchar_to_euckr,
10607+
NULL,
1058410608
NULL
1058510609
};
1058610610

@@ -10640,6 +10664,7 @@ const mbfl_encoding mbfl_encoding_uhc = {
1064010664
&vtbl_wchar_uhc,
1064110665
mb_uhc_to_wchar,
1064210666
mb_wchar_to_uhc,
10667+
NULL,
1064310668
NULL
1064410669
};
1064510670

@@ -11555,6 +11580,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
1155511580
&vtbl_wchar_gb18030,
1155611581
mb_gb18030_to_wchar,
1155711582
mb_wchar_to_gb18030,
11583+
NULL,
1155811584
NULL
1155911585
};
1156011586

@@ -11591,6 +11617,7 @@ const mbfl_encoding mbfl_encoding_cp936 = {
1159111617
&vtbl_wchar_cp936,
1159211618
mb_cp936_to_wchar,
1159311619
mb_wchar_to_cp936,
11620+
NULL,
1159411621
NULL
1159511622
};
1159611623

@@ -12160,6 +12187,7 @@ const mbfl_encoding mbfl_encoding_big5 = {
1216012187
&vtbl_wchar_big5,
1216112188
mb_big5_to_wchar,
1216212189
mb_wchar_to_big5,
12190+
NULL,
1216312191
NULL
1216412192
};
1216512193

@@ -12194,6 +12222,7 @@ const mbfl_encoding mbfl_encoding_cp950 = {
1219412222
&vtbl_wchar_cp950,
1219512223
mb_cp950_to_wchar,
1219612224
mb_wchar_to_cp950,
12225+
NULL,
1219712226
NULL
1219812227
};
1219912228

@@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = {
1256712596
&vtbl_wchar_hz,
1256812597
mb_hz_to_wchar,
1256912598
mb_wchar_to_hz,
12599+
NULL,
1257012600
NULL
1257112601
};

ext/mbstring/libmbfl/filters/mbfilter_htmlent.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ const mbfl_encoding mbfl_encoding_html_ent = {
6767
&vtbl_wchar_html,
6868
mb_htmlent_to_wchar,
6969
mb_wchar_to_htmlent,
70+
NULL,
7071
NULL
7172
};
7273

ext/mbstring/libmbfl/filters/mbfilter_qprint.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ const mbfl_encoding mbfl_encoding_qprint = {
4646
NULL,
4747
mb_qprint_to_wchar,
4848
mb_wchar_to_qprint,
49+
NULL,
4950
NULL
5051
};
5152

ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
8787
&vtbl_wchar_##id, \
8888
mb_##id##_to_wchar, \
8989
mb_wchar_to_##id, \
90+
NULL, \
9091
NULL \
9192
}
9293

ext/mbstring/libmbfl/filters/mbfilter_ucs2.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
5757
&vtbl_wchar_ucs2,
5858
mb_ucs2_to_wchar,
5959
mb_wchar_to_ucs2be,
60+
NULL,
6061
NULL
6162
};
6263

@@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
7172
&vtbl_wchar_ucs2be,
7273
mb_ucs2be_to_wchar,
7374
mb_wchar_to_ucs2be,
75+
NULL,
7476
NULL
7577
};
7678

@@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
8587
&vtbl_wchar_ucs2le,
8688
mb_ucs2le_to_wchar,
8789
mb_wchar_to_ucs2le,
90+
NULL,
8891
NULL
8992
};
9093

ext/mbstring/libmbfl/filters/mbfilter_ucs4.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
5757
&vtbl_wchar_ucs4,
5858
mb_ucs4_to_wchar,
5959
mb_wchar_to_ucs4be,
60+
NULL,
6061
NULL
6162
};
6263

@@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
7172
&vtbl_wchar_ucs4be,
7273
mb_ucs4be_to_wchar,
7374
mb_wchar_to_ucs4be,
75+
NULL,
7476
NULL
7577
};
7678

@@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
8587
&vtbl_wchar_ucs4le,
8688
mb_ucs4le_to_wchar,
8789
mb_wchar_to_ucs4le,
90+
NULL,
8891
NULL
8992
};
9093

ext/mbstring/libmbfl/filters/mbfilter_utf16.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
189189
&vtbl_wchar_utf16,
190190
mb_utf16_to_wchar,
191191
mb_wchar_to_utf16be,
192+
NULL,
192193
NULL
193194
};
194195

@@ -203,6 +204,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
203204
&vtbl_wchar_utf16be,
204205
mb_utf16be_to_wchar,
205206
mb_wchar_to_utf16be,
207+
NULL,
206208
NULL
207209
};
208210

@@ -217,6 +219,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
217219
&vtbl_wchar_utf16le,
218220
mb_utf16le_to_wchar,
219221
mb_wchar_to_utf16le,
222+
NULL,
220223
NULL
221224
};
222225

ext/mbstring/libmbfl/filters/mbfilter_utf32.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ const mbfl_encoding mbfl_encoding_utf32 = {
5050
&vtbl_wchar_utf32,
5151
mb_utf32_to_wchar,
5252
mb_wchar_to_utf32be,
53+
NULL,
5354
NULL
5455
};
5556

@@ -64,6 +65,7 @@ const mbfl_encoding mbfl_encoding_utf32be = {
6465
&vtbl_wchar_utf32be,
6566
mb_utf32be_to_wchar,
6667
mb_wchar_to_utf32be,
68+
NULL,
6769
NULL
6870
};
6971

@@ -78,6 +80,7 @@ const mbfl_encoding mbfl_encoding_utf32le = {
7880
&vtbl_wchar_utf32le,
7981
mb_utf32le_to_wchar,
8082
mb_wchar_to_utf32le,
83+
NULL,
8184
NULL
8285
};
8386

0 commit comments

Comments
 (0)