diff --git a/Zend/Optimizer/zend_func_infos.h b/Zend/Optimizer/zend_func_infos.h index a3475fab6cc8f..34b8b9c4cbbf6 100644 --- a/Zend/Optimizer/zend_func_infos.h +++ b/Zend/Optimizer/zend_func_infos.h @@ -227,7 +227,7 @@ static const func_info_t func_infos[] = { F1("mb_strtoupper", MAY_BE_STRING), F1("mb_strtolower", MAY_BE_STRING), F1("mb_detect_encoding", MAY_BE_STRING|MAY_BE_FALSE), - F1("mb_list_encodings", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING), + FN("mb_list_encodings", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING), F1("mb_encoding_aliases", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING), F1("mb_encode_mimeheader", MAY_BE_STRING), F1("mb_decode_mimeheader", MAY_BE_STRING), diff --git a/ext/mbstring/mb_gpc.c b/ext/mbstring/mb_gpc.c index 1ffe0af56bc7e..33fbd32edc663 100644 --- a/ext/mbstring/mb_gpc.c +++ b/ext/mbstring/mb_gpc.c @@ -234,7 +234,7 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i } else if (info->num_from_encodings == 1) { from_encoding = info->from_encodings[0]; } else { - from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection)); + from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection), false); if (!from_encoding) { if (info->report_errors) { php_error_docref(NULL, E_WARNING, "Unable to detect encoding"); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 73dea4e5c6b81..356f25f0a41d3 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -90,7 +90,7 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc); static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding); -static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict); +static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant); static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent); @@ -452,7 +452,7 @@ static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *a list_size = MBSTRG(current_detect_order_list_size); } - return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false); + return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false, false); } static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from) @@ -1016,6 +1016,7 @@ ZEND_TSRMLS_CACHE_UPDATE(); mbstring_globals->internal_encoding_set = 0; mbstring_globals->http_output_set = 0; mbstring_globals->http_input_set = 0; + mbstring_globals->all_encodings_list = NULL; } /* }}} */ @@ -1156,6 +1157,13 @@ PHP_RSHUTDOWN_FUNCTION(mbstring) MBSTRG(outconv_enabled) = false; MBSTRG(outconv_state) = 0; + if (MBSTRG(all_encodings_list)) { + GC_DELREF(MBSTRG(all_encodings_list)); + zend_hash_destroy(MBSTRG(all_encodings_list)); + efree(MBSTRG(all_encodings_list)); + MBSTRG(all_encodings_list) = NULL; + } + #ifdef HAVE_MBREGEX PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU); #endif @@ -2687,7 +2695,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t leng from_encoding = *from_encodings; } else { /* auto detect */ - from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection)); + from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true); if (!from_encoding) { php_error_docref(NULL, E_WARNING, "Unable to detect character encoding"); return NULL; @@ -2988,28 +2996,38 @@ struct candidate { size_t in_len; uint64_t demerits; /* Wide bit size to prevent overflow */ unsigned int state; + float multiplier; }; -static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict) +static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant) { size_t j = 0; for (size_t i = 0; i < length; i++) { const mbfl_encoding *enc = encodings[i]; + array[j].enc = enc; + array[j].state = 0; + array[j].demerits = 0; + /* If any candidate encodings have specialized validation functions, use them * to eliminate as many candidates as possible */ - if (strict && enc->check != NULL) { + if (enc->check != NULL) { for (size_t k = 0; k < n; k++) { if (!enc->check((unsigned char*)in[k], in_len[k])) { - goto skip_to_next; + if (strict) { + goto skip_to_next; + } else { + array[j].demerits += 500; + } } } } - array[j].enc = enc; - array[j].state = 0; - array[j].demerits = 0; + /* This multiplier can optionally be used to make candidate encodings listed + * first more likely to be chosen. It is a weight factor which multiplies + * the number of demerits counted for each candidate. */ + array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0; j++; skip_to_next: ; } @@ -3085,10 +3103,14 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict } } + for (size_t i = 0; i < length; i++) { + array[i].demerits *= array[i].multiplier; + } + return length; } -MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict) +MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant) { if (elist_size == 0) { return NULL; @@ -3109,7 +3131,7 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c /* Allocate on stack; when we return, this array is automatically freed */ struct candidate *array = alloca(elist_size * sizeof(struct candidate)); - elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict); + elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant); while (n--) { start_string(array, elist_size, strings[n], str_lengths[n]); @@ -3133,9 +3155,9 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c /* When doing 'strict' detection, any string which is invalid in the candidate encoding * is rejected. With non-strict detection, we just continue, but apply demerits for * each invalid byte sequence */ -static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict) +static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant) { - return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict); + return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant); } /* {{{ Encodings of the given string is returned (as a string) */ @@ -3154,8 +3176,17 @@ PHP_FUNCTION(mb_detect_encoding) Z_PARAM_BOOL(strict) ZEND_PARSE_PARAMETERS_END(); + /* Should we pay attention to the order of the provided candidate encodings and prefer + * the earlier ones (if more than one candidate encoding matches)? + * If the entire list of supported encodings returned by `mb_list_encodings` is passed + * in, then don't treat the order as significant */ + bool order_significant = true; + /* make encoding list */ if (encoding_ht) { + if (encoding_ht == MBSTRG(all_encodings_list)) { + order_significant = false; + } if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) { RETURN_THROWS(); } @@ -3187,7 +3218,7 @@ PHP_FUNCTION(mb_detect_encoding) if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) { ret = &mbfl_encoding_utf8; } else { - ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict); + ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant); } efree(ZEND_VOIDP(elist)); @@ -3205,10 +3236,22 @@ PHP_FUNCTION(mb_list_encodings) { ZEND_PARSE_PARAMETERS_NONE(); - array_init(return_value); - for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) { - add_next_index_string(return_value, (*encodings)->name); + if (MBSTRG(all_encodings_list) == NULL) { + /* Initialize shared array of supported encoding names + * This is done so that we can check if `mb_list_encodings()` is being + * passed to other mbstring functions using a cheap pointer equality check */ + HashTable *array = emalloc(sizeof(HashTable)); + zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false); + for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) { + zval tmp; + ZVAL_STRING(&tmp, (*encodings)->name); + zend_hash_next_index_insert(array, &tmp); + } + MBSTRG(all_encodings_list) = array; } + + GC_ADDREF(MBSTRG(all_encodings_list)); + RETURN_ARR(MBSTRG(all_encodings_list)); } /* }}} */ @@ -3536,8 +3579,15 @@ PHP_FUNCTION(mb_convert_variables) from_encoding = MBSTRG(current_internal_encoding); + bool order_significant = true; + /* pre-conversion encoding */ if (from_enc_ht) { + if (from_enc_ht == MBSTRG(all_encodings_list)) { + /* If entire list of supported encodings returned by `mb_list_encodings` is passed + * in, then don't treat the order of the list as significant */ + order_significant = false; + } if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) { RETURN_THROWS(); } @@ -3575,7 +3625,7 @@ PHP_FUNCTION(mb_convert_variables) RETURN_FALSE; } } - from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection)); + from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant); efree(ZEND_VOIDP(val_list)); efree(len_list); if (!from_encoding) { @@ -4293,7 +4343,7 @@ PHP_FUNCTION(mb_send_mail) /* Subject: */ const mbfl_encoding *enc = MBSTRG(current_internal_encoding); if (enc == &mbfl_encoding_pass) { - enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection)); + enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false); } const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF; size_t line_sep_len = strlen(line_sep); @@ -4303,7 +4353,7 @@ PHP_FUNCTION(mb_send_mail) /* message body */ const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding); if (msg_enc == &mbfl_encoding_pass) { - msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection)); + msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false); } unsigned int num_errors = 0; diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h index 0837e45cf327a..7256605bd2462 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -67,7 +67,7 @@ MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc) MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc); MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding); -MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict); +MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant); ZEND_BEGIN_MODULE_GLOBALS(mbstring) char *internal_encoding_name; @@ -88,6 +88,7 @@ ZEND_BEGIN_MODULE_GLOBALS(mbstring) size_t current_detect_order_list_size; enum mbfl_no_encoding *default_detect_order_list; size_t default_detect_order_list_size; + HashTable *all_encodings_list; int filter_illegal_mode; uint32_t filter_illegal_substchar; int current_filter_illegal_mode; diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php index 0d04e34bc7823..add0a35e05b80 100644 --- a/ext/mbstring/mbstring.stub.php +++ b/ext/mbstring/mbstring.stub.php @@ -140,7 +140,6 @@ function mb_detect_encoding(string $string, array|string|null $encodings = null, /** * @return array - * @refcount 1 */ function mb_list_encodings(): array {} diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h index 3bf10789e0d6c..e8985793e380f 100644 --- a/ext/mbstring/mbstring_arginfo.h +++ b/ext/mbstring/mbstring_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. - * Stub hash: 0c9ac8888b8332557f7098cfb9d259757af8b3c6 */ + * Stub hash: 26a027093075613056921c4d1a7eee65d52ec5eb */ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null") diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt index 2930942c12c5a..9aa4eb6925463 100644 --- a/ext/mbstring/tests/gh10192_utf7.phpt +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) { --EXPECT-- non-base64 character after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -93,7 +93,7 @@ int(0) base64 character before + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -174,7 +174,7 @@ int(2) - and + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -219,7 +219,7 @@ int(2) valid direct encoding character = after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -228,7 +228,7 @@ int(2) invalid direct encoding character ~ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -237,7 +237,7 @@ int(2) invalid direct encoding character \ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -246,7 +246,7 @@ int(2) invalid direct encoding character ESC after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt index 544375fbd2998..11d5a1c31364f 100644 --- a/ext/mbstring/tests/mb_detect_encoding.phpt +++ b/ext/mbstring/tests/mb_detect_encoding.phpt @@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF- echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n"; echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n"; +// This test case courtesy of Adrien Foulon +// It depends on the below use of '+' being recognized as invalid UTF-7 +$css = 'input[type="radio"]:checked + img { + border: 5px solid #0083ca; +}'; +echo mb_detect_encoding($css, mb_list_encodings(), true), "\n"; + echo "== DETECT ORDER ==\n"; mb_detect_order('auto'); @@ -88,6 +95,17 @@ print("EUC-JP: " . mb_detect_encoding($euc_jp) . "\n"); print("SJIS: " . mb_detect_encoding($sjis) . "\n"); +// Thanks to Ulrik Nielsen for the following tests; the hex strings are the same file, but in two +// different text encodings +// We do not have any strong hints showing that the second one is actually UTF-8... +// but mb_detect_encoding still guesses UTF-8 because it is the first one in the list + +$win1252text = hex2bin("2320546869732066696c6520636f6e7461696e732057696e646f77732d3132353220656e636f646564206461746120616e642048544d4c20656e7469746965730a61626364650ae6f8e50af00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790af00ad00a2623383739313b0a262331373238373b0a262333383937393b0a2623353437333b0a616263646520e6f8e520f020d0203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e202623383739313b20262331373238373b20262333383937393b202623353437333b0a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320a8c0a890a2320506f6c69736820737472696e670a50727a656a6426233337383b20646f2070727a65676c26233236313b64750a"); +echo mb_detect_encoding($win1252text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n"; + +$utf8text = hex2bin("2320546869732066696c6520636f6e7461696e73205554462d3820656e636f64656420646174610a61626364650ac3a6c3b8c3a50ac3b00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790ac3b00ac3900ae289970ae48e870ae9a1830ae195a10a616263646520c3a6c3b8c3a520c3b020c390203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e20e2899720e48e8720e9a18320e195a10a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320ac5920ae280b00a2320506f6c69736820737472696e670a50727a656a64c5ba20646f2070727a65676cc48564750a"); +echo mb_detect_encoding($utf8text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n"; + echo "== INVALID PARAMETER ==\n"; print("INT: " . mb_detect_encoding(1234, 'EUC-JP') . "\n"); // EUC-JP @@ -389,10 +407,13 @@ UTF-8 UTF-8 UTF-8 SJIS +UTF-8 == DETECT ORDER == JIS: JIS EUC-JP: EUC-JP SJIS: SJIS +Windows-1252 +UTF-8 == INVALID PARAMETER == INT: EUC-JP EUC-JP: EUC-JP