From d9405a9ff157f3065706a67a2cb0d2174b053ff0 Mon Sep 17 00:00:00 2001 From: pakutoma Date: Wed, 22 Mar 2023 02:09:14 +0900 Subject: [PATCH] Fix phpGH-10648: add check function pointer into mbfl_encoding Previously, mbstring used the same logic for encoding validationas for encoding conversion. However, there are cases where we want to use different logic for validation and conversion. For example, if a string ends up with missing input required by the encoding, or if a character is input that is invalid as an encoding but can be converted, the conversion should succeed and the validation should fail. To achieve this, a function pointer mb_check_fn has been added to struct mbfl_encoding to implement the logic used for validation. Also, added implementation of validation logic for UTF-7, UTF7-IMAP, ISO-2022-JP and JIS. --- NEWS | 5 + UPGRADING | 10 + ext/mbstring/libmbfl/filters/mbfilter_7bit.c | 3 +- .../libmbfl/filters/mbfilter_base64.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_big5.c | 6 +- .../libmbfl/filters/mbfilter_cp5022x.c | 9 +- .../libmbfl/filters/mbfilter_cp51932.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_cp932.c | 6 +- ext/mbstring/libmbfl/filters/mbfilter_cp936.c | 3 +- .../libmbfl/filters/mbfilter_euc_cn.c | 3 +- .../libmbfl/filters/mbfilter_euc_jp.c | 3 +- .../libmbfl/filters/mbfilter_euc_jp_win.c | 3 +- .../libmbfl/filters/mbfilter_euc_kr.c | 3 +- .../libmbfl/filters/mbfilter_euc_tw.c | 3 +- .../libmbfl/filters/mbfilter_gb18030.c | 3 +- .../libmbfl/filters/mbfilter_htmlent.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_hz.c | 3 +- .../libmbfl/filters/mbfilter_iso2022_jp_ms.c | 3 +- .../libmbfl/filters/mbfilter_iso2022_kr.c | 3 +- .../filters/mbfilter_iso2022jp_mobile.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_jis.c | 164 +++++- .../libmbfl/filters/mbfilter_qprint.c | 3 +- .../libmbfl/filters/mbfilter_singlebyte.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_sjis.c | 3 +- .../libmbfl/filters/mbfilter_sjis_2004.c | 9 +- .../libmbfl/filters/mbfilter_sjis_mac.c | 3 +- .../libmbfl/filters/mbfilter_sjis_mobile.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_ucs2.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_ucs4.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_uhc.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_utf32.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_utf7.c | 158 ++++- .../libmbfl/filters/mbfilter_utf7imap.c | 130 ++++- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 3 +- .../libmbfl/filters/mbfilter_utf8_mobile.c | 12 +- .../libmbfl/filters/mbfilter_uuencode.c | 3 +- ext/mbstring/libmbfl/filters/utf7_helper.h | 22 + ext/mbstring/libmbfl/mbfl/mbfilter.c | 10 + ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c | 3 +- ext/mbstring/libmbfl/mbfl/mbfilter_pass.c | 1 + ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c | 1 + ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 2 + ext/mbstring/mbstring.c | 4 + ext/mbstring/tests/gh10192_utf7.phpt | 542 ++++++++++++++++++ ext/mbstring/tests/gh10192_utf7imap.phpt | 423 ++++++++++++++ ext/mbstring/tests/gh10648.phpt | 155 +++++ ext/mbstring/tests/iso2022jp_encoding.phpt | 68 +-- ext/mbstring/tests/utf_encodings.phpt | 13 +- 49 files changed, 1750 insertions(+), 114 deletions(-) create mode 100644 ext/mbstring/libmbfl/filters/utf7_helper.h create mode 100644 ext/mbstring/tests/gh10192_utf7.phpt create mode 100644 ext/mbstring/tests/gh10192_utf7imap.phpt create mode 100644 ext/mbstring/tests/gh10648.phpt diff --git a/NEWS b/NEWS index 45661b5dffe18..f3e5c25ef9126 100644 --- a/NEWS +++ b/NEWS @@ -48,6 +48,11 @@ PHP NEWS - MBString: . ext/mbstring: fix new_value length check. (Max Kellermann) . Fix bug GH-10627 (mb_convert_encoding crashes PHP on Windows). (nielsdos) + . Fix bug GH-10192 (mb_detect_encoding() results for UTF-7 differ between + PHP 8.0 and 8.1 (if UTF-7 is present in the encodings list and the string + contains '+' character)). (pakutoma) + . Fix bug GH-10648 (mb_check_encoding() returns true for incorrect but + interpretable ISO-2022-JP byte sequences). (pakutoma) - Opcache: . Fix incorrect page_size check. (nielsdos) diff --git a/UPGRADING b/UPGRADING index 744fa57c1c8dc..05b33b231ee34 100644 --- a/UPGRADING +++ b/UPGRADING @@ -218,6 +218,16 @@ PHP 8.2 UPGRADE NOTES dba_fetch(string|array $key, $skip, $dba): string|false is still accepted, but it is recommended to use the new standard variant. +- MBString + . mb_check_encoding() now checks input encoding more strictly. + . mb_detect_encoding() now checks input encoding more strictly + when strict detection is enabled. + . mb_convert_encoding() checks the input encoding more strictly + if multiple encodings are passed to from_encoding + and the mbstring.strict_detection INI directive is set to 1. + This change only affects the encoding selection, + not the result of the conversion. + - Random . random_bytes() and random_int() now throw \Random\RandomException on CSPRNG failure. Previously a plain \Exception was thrown. diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index 208792720af46..fd62602ab17b6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_7bit = { &vtbl_7bit_wchar, &vtbl_wchar_7bit, mb_7bit_to_wchar, - mb_wchar_to_7bit + mb_wchar_to_7bit, + NULL }; #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index ede3eef18ce7c..162e9b1bda87d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_base64 = { NULL, NULL, mb_base64_to_wchar, - mb_wchar_to_base64 + mb_wchar_to_base64, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_b64 = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_big5.c b/ext/mbstring/libmbfl/filters/mbfilter_big5.c index 58f89d1b5759e..7618130aac81e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_big5.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_big5.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_big5 = { &vtbl_big5_wchar, &vtbl_wchar_big5, mb_big5_to_wchar, - mb_wchar_to_big5 + mb_wchar_to_big5, + NULL }; const mbfl_encoding mbfl_encoding_cp950 = { @@ -82,7 +83,8 @@ const mbfl_encoding mbfl_encoding_cp950 = { &vtbl_cp950_wchar, &vtbl_wchar_cp950, mb_cp950_to_wchar, - mb_wchar_to_cp950 + mb_wchar_to_cp950, + NULL }; const struct mbfl_convert_vtbl vtbl_big5_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index 32a8bdf15f59a..93c33da9543d0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = { &vtbl_cp50220_wchar, &vtbl_wchar_cp50220, mb_cp5022x_to_wchar, - mb_wchar_to_cp50220 + mb_wchar_to_cp50220, + NULL }; const mbfl_encoding mbfl_encoding_cp50221 = { @@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = { &vtbl_cp50221_wchar, &vtbl_wchar_cp50221, mb_cp5022x_to_wchar, - mb_wchar_to_cp50221 + mb_wchar_to_cp50221, + NULL }; const mbfl_encoding mbfl_encoding_cp50222 = { @@ -87,7 +89,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = { &vtbl_cp50222_wchar, &vtbl_wchar_cp50222, mb_cp5022x_to_wchar, - mb_wchar_to_cp50222 + mb_wchar_to_cp50222, + NULL }; const struct mbfl_convert_vtbl vtbl_cp50220_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index 6311f9b72139a..d3aae8b10f56e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = { &vtbl_cp51932_wchar, &vtbl_wchar_cp51932, mb_cp51932_to_wchar, - mb_wchar_to_cp51932 + mb_wchar_to_cp51932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index cf8e461e1d9c0..506c24393906d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -100,7 +100,8 @@ const mbfl_encoding mbfl_encoding_cp932 = { &vtbl_cp932_wchar, &vtbl_wchar_cp932, mb_cp932_to_wchar, - mb_wchar_to_cp932 + mb_wchar_to_cp932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp932_wchar = { @@ -133,7 +134,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = { &vtbl_sjiswin_wchar, &vtbl_wchar_sjiswin, mb_cp932_to_wchar, - mb_wchar_to_sjiswin + mb_wchar_to_sjiswin, + NULL }; const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c index 40ae8c86f9119..02e808ce9282c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_cp936 = { &vtbl_cp936_wchar, &vtbl_wchar_cp936, mb_cp936_to_wchar, - mb_wchar_to_cp936 + mb_wchar_to_cp936, + NULL }; const struct mbfl_convert_vtbl vtbl_cp936_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c index 50a0368a923f4..cec5f5d41d5e6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = { &vtbl_euccn_wchar, &vtbl_wchar_euccn, mb_euccn_to_wchar, - mb_wchar_to_euccn + mb_wchar_to_euccn, + NULL }; const struct mbfl_convert_vtbl vtbl_euccn_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c index 2b0ae77534d56..aa5f323db6f0a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = { &vtbl_eucjp_wchar, &vtbl_wchar_eucjp, mb_eucjp_to_wchar, - mb_wchar_to_eucjp + mb_wchar_to_eucjp, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c index 09287a9d8f634..d35cec9541093 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { &vtbl_eucjpwin_wchar, &vtbl_wchar_eucjpwin, mb_eucjpwin_to_wchar, - mb_wchar_to_eucjpwin + mb_wchar_to_eucjpwin, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c index 69e6811922e30..b0cb1954739c9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = { &vtbl_euckr_wchar, &vtbl_wchar_euckr, mb_euckr_to_wchar, - mb_wchar_to_euckr + mb_wchar_to_euckr, + NULL }; const struct mbfl_convert_vtbl vtbl_euckr_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c index de1deb47705f1..522f5f4a05a5b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = { &vtbl_euctw_wchar, &vtbl_wchar_euctw, mb_euctw_to_wchar, - mb_wchar_to_euctw + mb_wchar_to_euctw, + NULL }; const struct mbfl_convert_vtbl vtbl_euctw_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c index 492df6046244f..d607aafef49e4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = { &vtbl_gb18030_wchar, &vtbl_wchar_gb18030, mb_gb18030_to_wchar, - mb_wchar_to_gb18030 + mb_wchar_to_gb18030, + NULL }; const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index afebdfd00811f..a75a9c757cb83 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_html_ent = { &vtbl_html_wchar, &vtbl_wchar_html, mb_htmlent_to_wchar, - mb_wchar_to_htmlent + mb_wchar_to_htmlent, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_html = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_hz.c b/ext/mbstring/libmbfl/filters/mbfilter_hz.c index 72e5963acfc18..b047bfc8b7b27 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_hz.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_hz.c @@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_hz = { &vtbl_hz_wchar, &vtbl_wchar_hz, mb_hz_to_wchar, - mb_wchar_to_hz + mb_wchar_to_hz, + NULL }; const struct mbfl_convert_vtbl vtbl_hz_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c index 65b6d66d2ec2e..e3676d30e2904 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c @@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = { &vtbl_2022jpms_wchar, &vtbl_wchar_2022jpms, mb_iso2022jpms_to_wchar, - mb_wchar_to_iso2022jpms + mb_wchar_to_iso2022jpms, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index c4b2bf0b9f1b9..d51fd720e9704 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_2022kr = { &vtbl_2022kr_wchar, &vtbl_wchar_2022kr, mb_iso2022kr_to_wchar, - mb_wchar_to_iso2022kr + mb_wchar_to_iso2022kr, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_2022kr = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c index ded492cafc13a..91ed4e1e84a21 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { &vtbl_2022jp_kddi_wchar, &vtbl_wchar_2022jp_kddi, mb_iso2022jp_kddi_to_wchar, - mb_wchar_to_iso2022jp_kddi + mb_wchar_to_iso2022jp_kddi, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_jis.c b/ext/mbstring/libmbfl/filters/mbfilter_jis.c index fc5f18aeb5d1c..80af0e695644c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_jis.c @@ -37,6 +37,8 @@ static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter); static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len); +static bool mb_check_jis(unsigned char *in, size_t in_len); const mbfl_encoding mbfl_encoding_jis = { mbfl_no_encoding_jis, @@ -49,6 +51,7 @@ const mbfl_encoding mbfl_encoding_jis = { &vtbl_wchar_jis, mb_iso2022jp_to_wchar, mb_wchar_to_jis, + mb_check_jis }; const mbfl_encoding mbfl_encoding_2022jp = { @@ -61,7 +64,8 @@ const mbfl_encoding mbfl_encoding_2022jp = { &vtbl_2022jp_wchar, &vtbl_wchar_2022jp, mb_iso2022jp_to_wchar, - mb_wchar_to_iso2022jp + mb_wchar_to_iso2022jp, + mb_check_iso2022jp }; const struct mbfl_convert_vtbl vtbl_jis_wchar = { @@ -780,3 +784,161 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +#define JISX_0201_KANA_SO 5 + +static bool mb_check_jis(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if (state == JISX_0201_KANA_SO) { + return false; + } + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else if (c3 == '(') { + if (p == e) { + return false; + } + unsigned char c4 = *p++; + if (c4 == '@' || c4 == 'B') { + state = JISX_0208; + } else if (c4 == 'D') { + state = JISX_0212; + } else { + return false; + } + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons. + * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */ + if (c3 == 'B' || c3 == 'H') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else if (c3 == 'I') { + state = JISX_0201_KANA; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE) { + /* "Kana In" marker */ + if (state != ASCII) { + return false; + } + state = JISX_0201_KANA_SO; + } else if (c == 0xF) { + /* "Kana Out" marker */ + if (state != JISX_0201_KANA_SO) { + return false; + } + state = ASCII; + } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (state == JISX_0208) { + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + } else { + if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) { + continue; + } + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else if (c >= 0xA1 && c <= 0xDF) { + /* GR-invoked Kana */ + continue; + } else { + return false; + } + } + + return state == ASCII; +} + + +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + if (c3 == 'B') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE || c == 0xF) { + /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */ + return false; + } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else { + return false; + } + } + + return state == ASCII; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 5fde30ee80935..2bcddedede337 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = { NULL, NULL, mb_qprint_to_wchar, - mb_wchar_to_qprint + mb_wchar_to_qprint, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_qprint = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index 56c9b2dbc85d9..c5872335a8526 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -86,7 +86,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int &vtbl_##id##_wchar, \ &vtbl_wchar_##id, \ mb_##id##_to_wchar, \ - mb_wchar_to_##id \ + mb_wchar_to_##id, \ + NULL \ } /* For single-byte encodings which use a conversion table */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index f23a8b08aceab..59399bf7217f0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis = { &vtbl_sjis_wchar, &vtbl_wchar_sjis, mb_sjis_to_wchar, - mb_wchar_to_sjis + mb_wchar_to_sjis, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 737871eda8a31..bc4d932187061 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { &vtbl_sjis2004_wchar, &vtbl_wchar_sjis2004, mb_sjis2004_to_wchar, - mb_wchar_to_sjis2004 + mb_wchar_to_sjis2004, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { @@ -100,7 +101,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { &vtbl_eucjp2004_wchar, &vtbl_wchar_eucjp2004, mb_eucjp2004_to_wchar, - mb_wchar_to_eucjp2004 + mb_wchar_to_eucjp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { @@ -133,7 +135,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = { &vtbl_2022jp_2004_wchar, &vtbl_wchar_2022jp_2004, mb_iso2022jp2004_to_wchar, - mb_wchar_to_iso2022jp2004 + mb_wchar_to_iso2022jp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 0ff2a198d36c8..8fb569b36c483 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { &vtbl_sjis_mac_wchar, &vtbl_wchar_sjis_mac, mb_sjismac_to_wchar, - mb_wchar_to_sjismac + mb_wchar_to_sjismac, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index 448e0a74ca3aa..7e8f412230b5a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -78,7 +78,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { &vtbl_sjis_docomo_wchar, &vtbl_wchar_sjis_docomo, mb_sjis_docomo_to_wchar, - mb_wchar_to_sjis_docomo + mb_wchar_to_sjis_docomo, + NULL }; const mbfl_encoding mbfl_encoding_sjis_kddi = { @@ -91,7 +92,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { &vtbl_sjis_kddi_wchar, &vtbl_wchar_sjis_kddi, mb_sjis_kddi_to_wchar, - mb_wchar_to_sjis_kddi + mb_wchar_to_sjis_kddi, + NULL }; const mbfl_encoding mbfl_encoding_sjis_sb = { @@ -104,7 +106,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { &vtbl_sjis_sb_wchar, &vtbl_wchar_sjis_sb, mb_sjis_sb_to_wchar, - mb_wchar_to_sjis_sb + mb_wchar_to_sjis_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index 3e0d0828cfa62..e6711d82f8a70 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = { &vtbl_ucs2_wchar, &vtbl_wchar_ucs2, mb_ucs2_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = { &vtbl_ucs2be_wchar, &vtbl_wchar_ucs2be, mb_ucs2be_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = { &vtbl_ucs2le_wchar, &vtbl_wchar_ucs2le, mb_ucs2le_to_wchar, - mb_wchar_to_ucs2le + mb_wchar_to_ucs2le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs2_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index 90312b8d501d5..410be0ace74f5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c @@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = { &vtbl_ucs4_wchar, &vtbl_wchar_ucs4, mb_ucs4_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = { &vtbl_ucs4be_wchar, &vtbl_wchar_ucs4be, mb_ucs4be_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = { &vtbl_ucs4le_wchar, &vtbl_wchar_ucs4le, mb_ucs4le_to_wchar, - mb_wchar_to_ucs4le + mb_wchar_to_ucs4le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs4_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c index 2ac351d644cdb..644e0b063d9b9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_uhc = { &vtbl_uhc_wchar, &vtbl_wchar_uhc, mb_uhc_to_wchar, - mb_wchar_to_uhc + mb_wchar_to_uhc, + NULL }; const struct mbfl_convert_vtbl vtbl_uhc_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index eddd56f362756..2a7d98721df79 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf16 = { &vtbl_utf16_wchar, &vtbl_wchar_utf16, mb_utf16_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf16be = { &vtbl_utf16be_wchar, &vtbl_wchar_utf16be, mb_utf16be_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf16le = { &vtbl_utf16le_wchar, &vtbl_wchar_utf16le, mb_utf16le_to_wchar, - mb_wchar_to_utf16le + mb_wchar_to_utf16le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf16_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index e8cd4ad454f2e..58551c8b3932d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf32 = { &vtbl_utf32_wchar, &vtbl_wchar_utf32, mb_utf32_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf32be = { &vtbl_utf32be_wchar, &vtbl_wchar_utf32be, mb_utf32be_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf32le = { &vtbl_utf32le_wchar, &vtbl_wchar_utf32le, mb_utf32le_to_wchar, - mb_wchar_to_utf32le + mb_wchar_to_utf32le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf32_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index f5fe261f69d00..57641a4bbe41d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -29,10 +29,12 @@ #include "mbfilter.h" #include "mbfilter_utf7.h" +#include "utf7_helper.h" static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_utf7(unsigned char *in, size_t in_len); static const unsigned char mbfl_base64_table[] = { /* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */ @@ -59,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf7 = { &vtbl_utf7_wchar, &vtbl_wchar_utf7, mb_utf7_to_wchar, - mb_wchar_to_utf7 + mb_wchar_to_utf7, + mb_check_utf7 }; const struct mbfl_convert_vtbl vtbl_utf7_wchar = { @@ -408,16 +411,24 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter) return 0; } -/* Ways which a Base64-encoded section can end: */ -#define DASH 0xFD -#define ASCII 0xFE -#define ILLEGAL 0xFF - static inline bool is_base64_end(unsigned char c) { return c >= DASH; } +static bool is_optional_direct(unsigned char c) +{ + /* Characters that are allowed to be encoded by Base64 or directly encoded */ + return c == '!' || c == '"' || c == '#' || c == '$' || c == '%' || c == '&' || c == '*' || c == ';' || c == '<' || + c == '=' || c == '>' || c == '@' || c == '[' || c == ']' || c == '^' || c == '_' || c == '`' || c == '{' || + c == '|' || c == '}'; +} + +static bool can_end_base64(uint32_t c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?'; +} + static unsigned char decode_base64(unsigned char c) { if (c >= 'A' && c <= 'Z') { @@ -432,6 +443,8 @@ static unsigned char decode_base64(unsigned char c) return 63; } else if (c == '-') { return DASH; + } else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') { + return DIRECT; } else if (c <= 0x7F) { return ASCII; } @@ -470,7 +483,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t if (n == ILLEGAL) { *out++ = MBFL_BAD_INPUT; - } else if (n == ASCII) { + } else if (n == DIRECT || n == ASCII) { (*p)--; /* Unconsume byte */ } @@ -596,11 +609,6 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf return out - buf; } -static bool can_end_base64(uint32_t c) -{ - return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?'; -} - static bool should_direct_encode(uint32_t c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c); @@ -700,3 +708,129 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else { + /* 2nd part of surrogate pair came unexpectedly */ + return !(cp >= 0xDC00 && cp <= 0xDFFF); + } +} + +static bool can_encode_directly(unsigned char c) +{ + return should_direct_encode(c) || is_optional_direct(c) || c == '\0'; +} + +static bool mb_check_utf7(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + /* It is an error if trailing padding bits are not zeroes or if we were + * expecting the 2nd part of a surrogate pair when Base64 section ends */ + return !((n3 & 0x3) || is_surrogate); + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return !((n6 & 0xF) || is_surrogate); + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '+') { + if (p == e) { + base64 = true; + return !is_surrogate; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n > DASH) { + /* If a "+" character followed immediately by any character other than base64 or "-" */ + return false; + } else { + base64 = true; + } + } else if (can_encode_directly(c)) { + continue; + } else { + return false; + } + } + } + return !is_surrogate; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index 850edfbd63a1e..77b65bbeee8a4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -77,11 +77,13 @@ #include "mbfilter.h" #include "mbfilter_utf7imap.h" +#include "utf7_helper.h" static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_utf7imap(unsigned char *in, size_t in_len); static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL}; @@ -95,7 +97,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = { &vtbl_utf7imap_wchar, &vtbl_wchar_utf7imap, mb_utf7imap_to_wchar, - mb_wchar_to_utf7imap + mb_wchar_to_utf7imap, + mb_check_utf7imap }; const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = { @@ -444,10 +447,6 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter) return 0; } -/* Ways which a Base64-encoded section can end: */ -#define DASH 0xFE -#define ILLEGAL 0xFF - static inline bool is_base64_end(unsigned char c) { return c >= DASH; @@ -732,3 +731,124 @@ static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, MB_CONVERT_BUF_STORE(buf, out, limit); } + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else if (cp >= 0xDC00 && cp <= 0xDFFF) { + /* 2nd part of surrogate pair came unexpectedly */ + return false; + } else if (cp >= 0x20 && cp <= 0x7E && cp != '&') { + return false; + } + return true; +} + +static bool mb_check_utf7imap(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + /* Base64 section */ + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '&') { + if (p == e) { + return false; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n == ILLEGAL) { + return false; + } else { + base64 = true; + } + } else if (c >= 0x20 && c <= 0x7E) { + continue; + } else { + return false; + } + } + } + return !base64; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 6c7bad0e805ac..44df3ab4257cd 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_utf8 = { &vtbl_utf8_wchar, &vtbl_wchar_utf8, mb_utf8_to_wchar, - mb_wchar_to_utf8 + mb_wchar_to_utf8, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index c573ec70f3bc9..59e2676208b90 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { &vtbl_utf8_docomo_wchar, &vtbl_wchar_utf8_docomo, mb_utf8_docomo_to_wchar, - mb_wchar_to_utf8_docomo + mb_wchar_to_utf8_docomo, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_a = { @@ -76,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { &vtbl_utf8_kddi_a_wchar, &vtbl_wchar_utf8_kddi_a, mb_utf8_kddi_a_to_wchar, - mb_wchar_to_utf8_kddi_a + mb_wchar_to_utf8_kddi_a, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_b = { @@ -89,7 +91,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { &vtbl_utf8_kddi_b_wchar, &vtbl_wchar_utf8_kddi_b, mb_utf8_kddi_b_to_wchar, - mb_wchar_to_utf8_kddi_b + mb_wchar_to_utf8_kddi_b, + NULL }; const mbfl_encoding mbfl_encoding_utf8_sb = { @@ -102,7 +105,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { &vtbl_utf8_sb_wchar, &vtbl_wchar_utf8_sb, mb_utf8_sb_to_wchar, - mb_wchar_to_utf8_sb + mb_wchar_to_utf8_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index cc90997c2fcae..83a56977d3e0e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_uuencode = { NULL, NULL, mb_uuencode_to_wchar, - mb_wchar_to_uuencode + mb_wchar_to_uuencode, + NULL }; const struct mbfl_convert_vtbl vtbl_uuencode_8bit = { diff --git a/ext/mbstring/libmbfl/filters/utf7_helper.h b/ext/mbstring/libmbfl/filters/utf7_helper.h new file mode 100644 index 0000000000000..0e71a5a449031 --- /dev/null +++ b/ext/mbstring/libmbfl/filters/utf7_helper.h @@ -0,0 +1,22 @@ +#ifndef MBFL_UTF7_HELPER_H +#define MBFL_UTF7_HELPER_H + +#include "mbfilter.h" + +/* Ways which a Base64-encoded section can end: */ +#define DASH 0xFC +#define DIRECT 0xFD +#define ASCII 0xFE +#define ILLEGAL 0xFF + +static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate) +{ + return !(gap || is_surrogate || n == ASCII || n == ILLEGAL); +} + +static inline bool has_surrogate(uint16_t cp, bool is_surrogate) +{ + return !is_surrogate && cp >= 0xD800 && cp <= 0xDBFF; +} + +#endif /* MBFL_UTF7_HELPER_H */ diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index e0cfa13e0b4f6..5f5ce07ce6f66 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -312,6 +312,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str unsigned char *p = string->val; int bad = 0; + if (identd->strict) { + for (int i = 0; i < num; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + mbfl_encoding_detector_data *data = &identd->filter_data[i]; + if (filter->from->check != NULL && !(filter->from->check)(p, n)) { + data->num_illegalchars++; + } + } + } + while (n--) { for (int i = 0; i < num; i++) { mbfl_convert_filter *filter = identd->filter_list[i]; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index 8fe51c9fd4cbb..43db2f7f5b20b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_8bit = { &vtbl_8bit_wchar, &vtbl_wchar_8bit, mb_8bit_to_wchar, - mb_wchar_to_8bit + mb_wchar_to_8bit, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_wchar = { diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c index 3fb7e991141cd..b932603e1c5f4 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c @@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_pass = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c index 5472d792a83cb..2bd9cca7b5b2a 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c @@ -42,5 +42,6 @@ const mbfl_encoding mbfl_encoding_wchar = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index e5ae285098ea0..f66e85acd8a81 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -143,6 +143,7 @@ typedef struct { typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state); typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end); +typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len); /* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`, * the buffer must be at least this size (to work with all supported text encodings) */ @@ -232,6 +233,7 @@ typedef struct { const struct mbfl_convert_vtbl *output_filter; mb_to_wchar_fn to_wchar; mb_from_wchar_fn from_wchar; + mb_check_fn check; } mbfl_encoding; MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index a9ffbef13317b..f53031aaf9157 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -4412,6 +4412,10 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m unsigned char *in = (unsigned char*)input; unsigned int state = 0; + if (encoding->check != NULL) { + return encoding->check(in, length); + } + /* If the input string is not encoded in the given encoding, there is a significant chance * that this will be seen in the first bytes. Therefore, rather than converting an entire * buffer of 128 codepoints, convert and check just a few codepoints first */ diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt new file mode 100644 index 0000000000000..2930942c12c5a --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -0,0 +1,542 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A + B', + 'non-base64 character after -' => 'A - B', + 'base64 character before +' => 'A 1+ B', + 'base64 character before -' => 'A 1- B', + 'base64 character after +' => 'A +1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after +' => 'A 1+1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with +' => 'A +', + 'string ends with -' => 'A -', + '+ and -' => 'A +- B', + '- and +' => 'A -+ B', + 'valid direct encoding character =' => 'A = B', + 'invalid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character \\' => 'A \\ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character = after +' => 'A += B', + 'invalid direct encoding character ~ after +' => 'A +~ B', + 'invalid direct encoding character \\ after +' => 'A +\\ B', + 'invalid direct encoding character ESC after +' => "A +\x1b B", + 'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B', + 'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B', + 'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B', + 'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B', + 'valid base64 character between + and end of string' => 'A +ZeVnLIqe', + 'invalid base64 character between + and end of string' => 'A +ZeVnLIq', + 'valid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'valid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B', + 'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B', + 'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ', + 'first 16 bits of base64 character using surrogate pair between + and end of string' => 'A +2Gc', + 'invalid base64 character using surrogate pair in reverse order between + and -' => 'A +3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between + and -' => 'A +3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j0 B', + 'invalid base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j01 B', + 'invalid base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false)); + var_dump(mb_detect_encoding($case, 'UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-7', false)); + var_dump(mb_check_encoding($case, 'UTF-7')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +non-base64 character after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A B" +int(0) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A - B" +int(0) + +base64 character before + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A 1 B" +int(0) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 1- B" +int(0) + +base64 character after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(1) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A -1 B" +int(1) + +base64 character before and after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A 1? B" +int(2) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(7) "A 1-1 B" +int(2) + +string ends with + +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(2) "A " +int(2) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(3) "A -" +int(2) + ++ and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A + B" +int(2) + +- and + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A - B" +int(2) + +valid direct encoding character = +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid direct encoding character = after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid base64 character between + and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(2) + +invalid base64 character between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(3) + +valid base64 character between + and non-base64 character +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(3) + +invalid base64 character between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(4) + +valid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A 日本語? B" +int(5) + +invalid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本誵 B" +int(5) + +valid base64 character between + and end of string +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A 日本語" +int(5) + +invalid base64 character between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 日本?" +int(6) + +valid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(7) + +invalid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(8) + +valid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +invalid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +valid base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(8) + +first 16 bits of base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(9) + +valid base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(9) + +first 16 bits of base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(10) + +valid base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 𩸽? B" +int(11) + +first 16 bits of base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(12) + +valid base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 𩸽" +int(12) + +first 16 bits of base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" +int(13) + +invalid base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(15) + +last 16 bits of base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(16) + +invalid base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(18) + +last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(19) + +invalid base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(21) + +last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(23) + +invalid base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A ??" +int(25) + +last 16 bits of base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" +int(26) diff --git a/ext/mbstring/tests/gh10192_utf7imap.phpt b/ext/mbstring/tests/gh10192_utf7imap.phpt new file mode 100644 index 0000000000000..c4f50884f6daf --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7imap.phpt @@ -0,0 +1,423 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A & B', + 'non-base64 character after -' => 'A - B', + 'base64 character before &' => 'A 1& B', + 'base64 character before -' => 'A 1- B', + 'base64 character after &' => 'A &1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after &' => 'A 1&1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with &' => 'A &', + 'string ends with -' => 'A -', + '& and -' => 'A &- B', + '- and &' => 'A -& B', + 'valid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character ~ after &' => 'A &~ B', + 'invalid direct encoding character ESC after &' => "A &\x1b B", + 'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B', + 'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B', + 'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B', + 'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B', + 'valid base64 character between & and end of string' => 'A &ZeVnLIqe', + 'invalid base64 character between & and end of string' => 'A &ZeVnLIq', + 'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between & and non-base64 character' => 'A &2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B', + 'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B', + 'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ', + 'first 16 bits of base64 character using surrogate pair between & and end of string' => 'A &2Gc', + 'invalid base64 character using surrogate pair in reverse order between & and -' => 'A &3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between & and -' => 'A &3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j0 B', + 'invalid base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j01 B', + 'invalid base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false)); + var_dump(mb_check_encoding($case, 'UTF7-IMAP')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} + +?> +--EXPECT-- +non-base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(1) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A - B" +int(1) + +base64 character before & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(2) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A 1- B" +int(2) + +base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(3) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A -1 B" +int(3) + +base64 character before and after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(4) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(7) "A 1-1 B" +int(4) + +string ends with & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(3) "A ?" +int(5) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(3) "A -" +int(5) + +& and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A & B" +int(5) + +- and & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A -?B" +int(6) + +valid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A ~ B" +int(6) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(7) + +valid direct encoding character ~ after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(8) + +invalid direct encoding character ESC after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(9) + +valid base64 character between & and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(13) "A 日本語 B" +int(9) + +invalid base64 character between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(11) "A 日本? B" +int(10) + +valid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(11) + +invalid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(10) "A 日本?B" +int(12) + +valid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(13) + +invalid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本誵?B" +int(14) + +valid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(12) "A 日本語?" +int(15) + +invalid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(9) "A 日本?" +int(16) + +valid base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(8) "A 𩸽 B" +int(16) + +first 16 bits of base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(17) + +valid base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(18) + +first 16 bits of base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(19) + +valid base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(20) + +first 16 bits of base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(21) + +valid base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(7) "A 𩸽?" +int(22) + +first 16 bits of base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" +int(24) + +invalid base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(6) "A ?? B" +int(26) + +last 16 bits of base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(27) + +invalid base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(29) + +last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(31) + +invalid base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(33) + +last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(35) + +invalid base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ???" +int(38) + +last 16 bits of base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" +int(40) diff --git a/ext/mbstring/tests/gh10648.phpt b/ext/mbstring/tests/gh10648.phpt new file mode 100644 index 0000000000000..9f0b4b4db153a --- /dev/null +++ b/ext/mbstring/tests/gh10648.phpt @@ -0,0 +1,155 @@ +--TEST-- +GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences) +--EXTENSIONS-- +mbstring +--FILE-- + '1b244224221b2842', // 'あ' in ISO-2022-JP + 'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'ア' in JIS + 'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS + 'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS + 'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208 + 'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212 + 'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213 + 'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // an escape sequence transitioning to ASCII + 'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // an escape sequence transitioning to ASCII +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + echo 'JIS:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'JIS')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo 'ISO-2022-JP:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP'). PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +ISO-2022-JP bytes +JIS: +bool(true) +あ +int(0) +ISO-2022-JP: +bool(true) +あ +int(0) + +ISO-2022-JP bytes without escape sequence +JIS: +bool(false) +あ +int(0) +ISO-2022-JP: +bool(false) +あ +int(0) + +JIS X 0201 7bit kana with escape sequence +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO/SI +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 8bit kana +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO and ESC +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with ESC and SI +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0208 character +JIS: +bool(true) +鯛 +int(0) +ISO-2022-JP: +bool(true) +鯛 +int(0) + +JIS X 0212 character +JIS: +bool(true) +鮋 +int(0) +ISO-2022-JP: +bool(false) +鮋 +int(0) + +JIS X 0213 character +JIS: +bool(false) +?$(P}L +int(1) +ISO-2022-JP: +bool(false) +?$(P}L +int(2) + +JIS C 6220-1969 ESC ( H +JIS: +bool(true) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) + +SO/SI when not in ASCII mode +JIS: +bool(false) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt index 634f0976994c3..5da1899c855b1 100644 --- a/ext/mbstring/tests/iso2022jp_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_encoding.phpt @@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) { /* ESC ( B at the beginning is redundant, since ASCII mode is the default */ if (substr($from, 0, 3) == "\x1B(B") $from = substr($from, 3, strlen($from) - 3); - /* If the string switches to a different charset, it should switch back to - * ASCII at the end */ - if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false) - $from .= "\x1B(B"; - convertValidString($to, $from, 'UTF-16BE', $encoding, false); } } @@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) { for ($i = 0; $i < 0x80; $i++) { if ($i == 0xE || $i == 0xF || $i == 0x1B) continue; - testValid(chr($i), "\x00" . chr($i), 'JIS'); - testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */ - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); - testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid(chr($i), "\x00" . chr($i), 'JIS'); + convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */ + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); + testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); } for ($i = 0x80; $i < 256; $i++) { @@ -92,27 +87,27 @@ echo "ASCII support OK\n"; foreach ($jisx0201Chars as $jisx0201 => $utf16BE) { if (ord($jisx0201) >= 128) { $kana = chr(ord($jisx0201) - 128); - testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false); - testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */ + testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false); + testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */ testValid($jisx0201, $utf16BE, 'JIS', false); } else { - testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80"); + testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80"); } } for ($i = 0x80; $i < 256; $i++) { if ($i >= 0xA1 && $i <= 0xDF) continue; - testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS'); - testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS'); } echo "JIS X 0201 support OK\n"; /* All valid JISX0208 characters */ foreach ($jisx0208Chars as $jisx0208 => $utf16BE) { - testValid("\x1B\$B" . $jisx0208, $utf16BE, 'JIS'); - testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP'); } /* All invalid 2-byte JISX0208 characters */ @@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0208Chars[$testString])) { - testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS'); - testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP'); } } } @@ -142,7 +137,7 @@ echo "JIS X 0208 support OK\n"; /* All valid JISX0212 characters */ foreach ($jisx0212Chars as $jisx0212 => $utf16BE) { - testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false); + testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false); } /* All invalid 2-byte JISX0212 characters */ @@ -150,14 +145,14 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0212Chars[$testString])) { - testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS'); } } } /* Try truncated JISX0212 characters */ for ($i = 0x21; $i <= 0x7E; $i++) { - testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . chr($i) . "\x1B(B", "\x00%\x00%", 'JIS'); } testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false); @@ -167,29 +162,36 @@ convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false); echo "JIS X 0212 support OK\n"; /* All possible escape sequences */ -$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true]; for ($i = 0; $i <= 0xFF; $i++) { for ($j = 0; $j <= 0xFF; $j++) { $escapeSequence = "\x1B" . chr($i) . chr($j); if ($escapeSequence === "\x1B\$(") continue; - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } } for ($i = 0; $i <= 0xFF; $i++) { $escapeSequence = "\x1B\$(" . chr($i); - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } /* Also try a bare ESC */ diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 2f050c657c2f3..23166142088cc 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -1011,17 +1011,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-' // (Just trying to be exhaustive here) testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false); -// + section terminated by a non-Base64 ASCII character which is NOT - -for ($i = 0; $i < 128; $i++) { - if ($i >= ord('A') && $i <= ord('Z')) - continue; - if ($i >= ord('a') && $i <= ord('z')) - continue; - if ($i >= ord('0') && $i <= ord('9')) - continue; - if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~')) - continue; - $char = chr($i); +// + section terminated by a non-Base64 direct character which is NOT - +foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) { testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false); }