Skip to content

Add fast mb_strcut implementation for UTF-8 #12337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions UPGRADING
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ PHP 8.4 UPGRADE NOTES
5. Changed Functions
========================================

- MBString:
. As a result of a performance optimization, mb_strcut sometimes returns
slightly different output for invalid UTF-8 strings. The previous
implementation would sometimes remove invalid UTF-8 bytes just after the cut
points, but in other cases it would pass them through to the output
unchanged. The new implementation behaves consistently: at both the starting
and ending cut points, if the byte immediately after the cut point is a
continuation byte, it always backs up to the preceding non-continuation byte
(a byte which starts a UTF-8 character). (For valid UTF-8 strings, the output
of mb_strcut is unchanged.)

- Standard:
. The internal implementation for rounding to integers has been rewritten
to be easier to verify for correctness and to be easier to maintain.
Expand Down
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_7bit.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ const mbfl_encoding mbfl_encoding_7bit = {
&vtbl_wchar_7bit,
mb_7bit_to_wchar,
mb_wchar_to_7bit,
NULL,
NULL
};

Expand Down
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_base64.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ const mbfl_encoding mbfl_encoding_base64 = {
NULL,
mb_base64_to_wchar,
mb_wchar_to_base64,
NULL,
NULL
};

Expand Down
34 changes: 32 additions & 2 deletions ext/mbstring/libmbfl/filters/mbfilter_cjk.c
Original file line number Diff line number Diff line change
Expand Up @@ -4392,7 +4392,8 @@ const mbfl_encoding mbfl_encoding_jis = {
&vtbl_wchar_jis,
mb_iso2022jp_to_wchar,
mb_wchar_to_jis,
mb_check_jis
mb_check_jis,
NULL
};

static const struct mbfl_convert_vtbl vtbl_2022jp_wchar = {
Expand Down Expand Up @@ -4426,7 +4427,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
&vtbl_wchar_2022jp,
mb_iso2022jp_to_wchar,
mb_wchar_to_iso2022jp,
mb_check_iso2022jp
mb_check_iso2022jp,
NULL
};

static const char *mbfl_encoding_2022jp_kddi_aliases[] = {"ISO-2022-JP-KDDI", NULL};
Expand Down Expand Up @@ -4462,6 +4464,7 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
&vtbl_wchar_2022jp_kddi,
mb_iso2022jp_kddi_to_wchar,
mb_wchar_to_iso2022jp_kddi,
NULL,
NULL
};

Expand Down Expand Up @@ -4496,6 +4499,7 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
&vtbl_wchar_2022jp_2004,
mb_iso2022jp2004_to_wchar,
mb_wchar_to_iso2022jp2004,
NULL,
NULL
};

Expand Down Expand Up @@ -4581,6 +4585,7 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
&vtbl_wchar_cp50220,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50220,
NULL,
NULL
};

Expand All @@ -4595,6 +4600,7 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
&vtbl_wchar_cp50221,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50221,
NULL,
NULL
};

Expand All @@ -4609,6 +4615,7 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
&vtbl_wchar_cp50222,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50222,
NULL,
NULL
};

Expand Down Expand Up @@ -4645,6 +4652,7 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
&vtbl_wchar_2022jpms,
mb_iso2022jpms_to_wchar,
mb_wchar_to_iso2022jpms,
NULL,
NULL
};

Expand Down Expand Up @@ -4687,6 +4695,7 @@ const mbfl_encoding mbfl_encoding_2022kr = {
&vtbl_wchar_2022kr,
mb_iso2022kr_to_wchar,
mb_wchar_to_iso2022kr,
NULL,
NULL
};

Expand Down Expand Up @@ -7832,6 +7841,7 @@ const mbfl_encoding mbfl_encoding_sjis = {
&vtbl_wchar_sjis,
mb_sjis_to_wchar,
mb_wchar_to_sjis,
NULL,
NULL
};

Expand Down Expand Up @@ -7868,6 +7878,7 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
&vtbl_wchar_sjis_mac,
mb_sjismac_to_wchar,
mb_wchar_to_sjismac,
NULL,
NULL
};

Expand Down Expand Up @@ -7906,6 +7917,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
&vtbl_wchar_sjis_docomo,
mb_sjis_docomo_to_wchar,
mb_wchar_to_sjis_docomo,
NULL,
NULL
};

Expand Down Expand Up @@ -7940,6 +7952,7 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
&vtbl_wchar_sjis_kddi,
mb_sjis_kddi_to_wchar,
mb_wchar_to_sjis_kddi,
NULL,
NULL
};

Expand Down Expand Up @@ -7974,6 +7987,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
&vtbl_wchar_sjis_sb,
mb_sjis_sb_to_wchar,
mb_wchar_to_sjis_sb,
NULL,
NULL
};

Expand Down Expand Up @@ -8017,6 +8031,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
&vtbl_wchar_sjis2004,
mb_sjis2004_to_wchar,
mb_wchar_to_sjis2004,
NULL,
NULL
};

Expand Down Expand Up @@ -8103,6 +8118,7 @@ const mbfl_encoding mbfl_encoding_cp932 = {
&vtbl_wchar_cp932,
mb_cp932_to_wchar,
mb_wchar_to_cp932,
NULL,
NULL
};

Expand Down Expand Up @@ -8137,6 +8153,7 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
&vtbl_wchar_sjiswin,
mb_cp932_to_wchar,
mb_wchar_to_sjiswin,
NULL,
NULL
};

Expand Down Expand Up @@ -10346,6 +10363,7 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
&vtbl_wchar_eucjp,
mb_eucjp_to_wchar,
mb_wchar_to_eucjp,
NULL,
NULL
};

Expand Down Expand Up @@ -10382,6 +10400,7 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
&vtbl_wchar_eucjp2004,
mb_eucjp2004_to_wchar,
mb_wchar_to_eucjp2004,
NULL,
NULL
};

Expand Down Expand Up @@ -10418,6 +10437,7 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
&vtbl_wchar_eucjpwin,
mb_eucjpwin_to_wchar,
mb_wchar_to_eucjpwin,
NULL,
NULL
};

Expand Down Expand Up @@ -10454,6 +10474,7 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
&vtbl_wchar_cp51932,
mb_cp51932_to_wchar,
mb_wchar_to_cp51932,
NULL,
NULL
};

Expand Down Expand Up @@ -10509,6 +10530,7 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
&vtbl_wchar_euccn,
mb_euccn_to_wchar,
mb_wchar_to_euccn,
NULL,
NULL
};

Expand Down Expand Up @@ -10545,6 +10567,7 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
&vtbl_wchar_euctw,
mb_euctw_to_wchar,
mb_wchar_to_euctw,
NULL,
NULL
};

Expand Down Expand Up @@ -10581,6 +10604,7 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
&vtbl_wchar_euckr,
mb_euckr_to_wchar,
mb_wchar_to_euckr,
NULL,
NULL
};

Expand Down Expand Up @@ -10640,6 +10664,7 @@ const mbfl_encoding mbfl_encoding_uhc = {
&vtbl_wchar_uhc,
mb_uhc_to_wchar,
mb_wchar_to_uhc,
NULL,
NULL
};

Expand Down Expand Up @@ -11555,6 +11580,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
&vtbl_wchar_gb18030,
mb_gb18030_to_wchar,
mb_wchar_to_gb18030,
NULL,
NULL
};

Expand Down Expand Up @@ -11591,6 +11617,7 @@ const mbfl_encoding mbfl_encoding_cp936 = {
&vtbl_wchar_cp936,
mb_cp936_to_wchar,
mb_wchar_to_cp936,
NULL,
NULL
};

Expand Down Expand Up @@ -12160,6 +12187,7 @@ const mbfl_encoding mbfl_encoding_big5 = {
&vtbl_wchar_big5,
mb_big5_to_wchar,
mb_wchar_to_big5,
NULL,
NULL
};

Expand Down Expand Up @@ -12194,6 +12222,7 @@ const mbfl_encoding mbfl_encoding_cp950 = {
&vtbl_wchar_cp950,
mb_cp950_to_wchar,
mb_wchar_to_cp950,
NULL,
NULL
};

Expand Down Expand Up @@ -12567,5 +12596,6 @@ const mbfl_encoding mbfl_encoding_hz = {
&vtbl_wchar_hz,
mb_hz_to_wchar,
mb_wchar_to_hz,
NULL,
NULL
};
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_htmlent.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ const mbfl_encoding mbfl_encoding_html_ent = {
&vtbl_wchar_html,
mb_htmlent_to_wchar,
mb_wchar_to_htmlent,
NULL,
NULL
};

Expand Down
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_qprint.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ const mbfl_encoding mbfl_encoding_qprint = {
NULL,
mb_qprint_to_wchar,
mb_wchar_to_qprint,
NULL,
NULL
};

Expand Down
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
&vtbl_wchar_##id, \
mb_##id##_to_wchar, \
mb_wchar_to_##id, \
NULL, \
NULL \
}

Expand Down
3 changes: 3 additions & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_ucs2.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
&vtbl_wchar_ucs2,
mb_ucs2_to_wchar,
mb_wchar_to_ucs2be,
NULL,
NULL
};

Expand All @@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
&vtbl_wchar_ucs2be,
mb_ucs2be_to_wchar,
mb_wchar_to_ucs2be,
NULL,
NULL
};

Expand All @@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
&vtbl_wchar_ucs2le,
mb_ucs2le_to_wchar,
mb_wchar_to_ucs2le,
NULL,
NULL
};

Expand Down
3 changes: 3 additions & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_ucs4.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
&vtbl_wchar_ucs4,
mb_ucs4_to_wchar,
mb_wchar_to_ucs4be,
NULL,
NULL
};

Expand All @@ -71,6 +72,7 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
&vtbl_wchar_ucs4be,
mb_ucs4be_to_wchar,
mb_wchar_to_ucs4be,
NULL,
NULL
};

Expand All @@ -85,6 +87,7 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
&vtbl_wchar_ucs4le,
mb_ucs4le_to_wchar,
mb_wchar_to_ucs4le,
NULL,
NULL
};

Expand Down
3 changes: 3 additions & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_utf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ const mbfl_encoding mbfl_encoding_utf16 = {
&vtbl_wchar_utf16,
mb_utf16_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL
};

Expand All @@ -203,6 +204,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
&vtbl_wchar_utf16be,
mb_utf16be_to_wchar,
mb_wchar_to_utf16be,
NULL,
NULL
};

Expand All @@ -217,6 +219,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
&vtbl_wchar_utf16le,
mb_utf16le_to_wchar,
mb_wchar_to_utf16le,
NULL,
NULL
};

Expand Down
3 changes: 3 additions & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_utf32.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const mbfl_encoding mbfl_encoding_utf32 = {
&vtbl_wchar_utf32,
mb_utf32_to_wchar,
mb_wchar_to_utf32be,
NULL,
NULL
};

Expand All @@ -64,6 +65,7 @@ const mbfl_encoding mbfl_encoding_utf32be = {
&vtbl_wchar_utf32be,
mb_utf32be_to_wchar,
mb_wchar_to_utf32be,
NULL,
NULL
};

Expand All @@ -78,6 +80,7 @@ const mbfl_encoding mbfl_encoding_utf32le = {
&vtbl_wchar_utf32le,
mb_utf32le_to_wchar,
mb_wchar_to_utf32le,
NULL,
NULL
};

Expand Down
Loading