Skip to content

Consider order of candidate encodings when guessing source encoding of input string #11239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Zend/Optimizer/zend_func_infos.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ static const func_info_t func_infos[] = {
F1("mb_strtoupper", MAY_BE_STRING),
F1("mb_strtolower", MAY_BE_STRING),
F1("mb_detect_encoding", MAY_BE_STRING|MAY_BE_FALSE),
F1("mb_list_encodings", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING),
FN("mb_list_encodings", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING),
F1("mb_encoding_aliases", MAY_BE_ARRAY|MAY_BE_ARRAY_KEY_LONG|MAY_BE_ARRAY_OF_STRING),
F1("mb_encode_mimeheader", MAY_BE_STRING),
F1("mb_decode_mimeheader", MAY_BE_STRING),
Expand Down
2 changes: 1 addition & 1 deletion ext/mbstring/mb_gpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i
} else if (info->num_from_encodings == 1) {
from_encoding = info->from_encodings[0];
} else {
from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection));
from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection), false);
if (!from_encoding) {
if (info->report_errors) {
php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
Expand Down
90 changes: 70 additions & 20 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);

static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);

static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);

static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);

Expand Down Expand Up @@ -452,7 +452,7 @@ static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *a
list_size = MBSTRG(current_detect_order_list_size);
}

return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false);
return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false, false);
}

static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
Expand Down Expand Up @@ -1016,6 +1016,7 @@ ZEND_TSRMLS_CACHE_UPDATE();
mbstring_globals->internal_encoding_set = 0;
mbstring_globals->http_output_set = 0;
mbstring_globals->http_input_set = 0;
mbstring_globals->all_encodings_list = NULL;
Copy link
Contributor

@youkidearitai youkidearitai May 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ext/mbstring/mbstring.h needs to prototype declaration

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I overlooked.

}
/* }}} */

Expand Down Expand Up @@ -1156,6 +1157,13 @@ PHP_RSHUTDOWN_FUNCTION(mbstring)
MBSTRG(outconv_enabled) = false;
MBSTRG(outconv_state) = 0;

if (MBSTRG(all_encodings_list)) {
GC_DELREF(MBSTRG(all_encodings_list));
zend_hash_destroy(MBSTRG(all_encodings_list));
efree(MBSTRG(all_encodings_list));
MBSTRG(all_encodings_list) = NULL;
}

#ifdef HAVE_MBREGEX
PHP_RSHUTDOWN(mb_regex) (INIT_FUNC_ARGS_PASSTHRU);
#endif
Expand Down Expand Up @@ -2687,7 +2695,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t leng
from_encoding = *from_encodings;
} else {
/* auto detect */
from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection));
from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
if (!from_encoding) {
php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
return NULL;
Expand Down Expand Up @@ -2988,28 +2996,38 @@ struct candidate {
size_t in_len;
uint64_t demerits; /* Wide bit size to prevent overflow */
unsigned int state;
float multiplier;
};

static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict)
static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
{
size_t j = 0;

for (size_t i = 0; i < length; i++) {
const mbfl_encoding *enc = encodings[i];

array[j].enc = enc;
array[j].state = 0;
array[j].demerits = 0;

/* If any candidate encodings have specialized validation functions, use them
* to eliminate as many candidates as possible */
if (strict && enc->check != NULL) {
if (enc->check != NULL) {
for (size_t k = 0; k < n; k++) {
if (!enc->check((unsigned char*)in[k], in_len[k])) {
goto skip_to_next;
if (strict) {
goto skip_to_next;
} else {
array[j].demerits += 500;
}
}
}
}

array[j].enc = enc;
array[j].state = 0;
array[j].demerits = 0;
/* This multiplier can optionally be used to make candidate encodings listed
* first more likely to be chosen. It is a weight factor which multiplies
* the number of demerits counted for each candidate. */
array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
j++;
skip_to_next: ;
}
Expand Down Expand Up @@ -3085,10 +3103,14 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
}
}

for (size_t i = 0; i < length; i++) {
array[i].demerits *= array[i].multiplier;
}

return length;
}

MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
{
if (elist_size == 0) {
return NULL;
Expand All @@ -3109,7 +3131,7 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c

/* Allocate on stack; when we return, this array is automatically freed */
struct candidate *array = alloca(elist_size * sizeof(struct candidate));
elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict);
elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);

while (n--) {
start_string(array, elist_size, strings[n], str_lengths[n]);
Expand All @@ -3133,9 +3155,9 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
/* When doing 'strict' detection, any string which is invalid in the candidate encoding
* is rejected. With non-strict detection, we just continue, but apply demerits for
* each invalid byte sequence */
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
{
return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict);
return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
}

/* {{{ Encodings of the given string is returned (as a string) */
Expand All @@ -3154,8 +3176,17 @@ PHP_FUNCTION(mb_detect_encoding)
Z_PARAM_BOOL(strict)
ZEND_PARSE_PARAMETERS_END();

/* Should we pay attention to the order of the provided candidate encodings and prefer
* the earlier ones (if more than one candidate encoding matches)?
* If the entire list of supported encodings returned by `mb_list_encodings` is passed
* in, then don't treat the order as significant */
bool order_significant = true;

/* make encoding list */
if (encoding_ht) {
if (encoding_ht == MBSTRG(all_encodings_list)) {
order_significant = false;
}
if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
RETURN_THROWS();
}
Expand Down Expand Up @@ -3187,7 +3218,7 @@ PHP_FUNCTION(mb_detect_encoding)
if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
ret = &mbfl_encoding_utf8;
} else {
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
}

efree(ZEND_VOIDP(elist));
Expand All @@ -3205,10 +3236,22 @@ PHP_FUNCTION(mb_list_encodings)
{
ZEND_PARSE_PARAMETERS_NONE();

array_init(return_value);
for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
add_next_index_string(return_value, (*encodings)->name);
if (MBSTRG(all_encodings_list) == NULL) {
/* Initialize shared array of supported encoding names
* This is done so that we can check if `mb_list_encodings()` is being
* passed to other mbstring functions using a cheap pointer equality check */
HashTable *array = emalloc(sizeof(HashTable));
zend_hash_init(array, 80, NULL, zval_ptr_dtor_str, false);
for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) {
zval tmp;
ZVAL_STRING(&tmp, (*encodings)->name);
zend_hash_next_index_insert(array, &tmp);
}
MBSTRG(all_encodings_list) = array;
}

GC_ADDREF(MBSTRG(all_encodings_list));
RETURN_ARR(MBSTRG(all_encodings_list));
}
/* }}} */

Expand Down Expand Up @@ -3536,8 +3579,15 @@ PHP_FUNCTION(mb_convert_variables)

from_encoding = MBSTRG(current_internal_encoding);

bool order_significant = true;

/* pre-conversion encoding */
if (from_enc_ht) {
if (from_enc_ht == MBSTRG(all_encodings_list)) {
/* If entire list of supported encodings returned by `mb_list_encodings` is passed
* in, then don't treat the order of the list as significant */
order_significant = false;
}
if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
RETURN_THROWS();
}
Expand Down Expand Up @@ -3575,7 +3625,7 @@ PHP_FUNCTION(mb_convert_variables)
RETURN_FALSE;
}
}
from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection));
from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
efree(ZEND_VOIDP(val_list));
efree(len_list);
if (!from_encoding) {
Expand Down Expand Up @@ -4293,7 +4343,7 @@ PHP_FUNCTION(mb_send_mail)
/* Subject: */
const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
if (enc == &mbfl_encoding_pass) {
enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
}
const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
size_t line_sep_len = strlen(line_sep);
Expand All @@ -4303,7 +4353,7 @@ PHP_FUNCTION(mb_send_mail)
/* message body */
const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
if (msg_enc == &mbfl_encoding_pass) {
msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
}

unsigned int num_errors = 0;
Expand Down
3 changes: 2 additions & 1 deletion ext/mbstring/mbstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc);
MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);

MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);

ZEND_BEGIN_MODULE_GLOBALS(mbstring)
char *internal_encoding_name;
Expand All @@ -88,6 +88,7 @@ ZEND_BEGIN_MODULE_GLOBALS(mbstring)
size_t current_detect_order_list_size;
enum mbfl_no_encoding *default_detect_order_list;
size_t default_detect_order_list_size;
HashTable *all_encodings_list;
int filter_illegal_mode;
uint32_t filter_illegal_substchar;
int current_filter_illegal_mode;
Expand Down
1 change: 0 additions & 1 deletion ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ function mb_detect_encoding(string $string, array|string|null $encodings = null,

/**
* @return array<int, string>
* @refcount 1
*/
function mb_list_encodings(): array {}

Expand Down
2 changes: 1 addition & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions ext/mbstring/tests/gh10192_utf7.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) {
--EXPECT--
non-base64 character after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand All @@ -93,7 +93,7 @@ int(0)

base64 character before +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand Down Expand Up @@ -174,7 +174,7 @@ int(2)

- and +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand Down Expand Up @@ -219,7 +219,7 @@ int(2)

valid direct encoding character = after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand All @@ -228,7 +228,7 @@ int(2)

invalid direct encoding character ~ after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand All @@ -237,7 +237,7 @@ int(2)

invalid direct encoding character \ after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand All @@ -246,7 +246,7 @@ int(2)

invalid direct encoding character ESC after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
Expand Down
21 changes: 21 additions & 0 deletions ext/mbstring/tests/mb_detect_encoding.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-
echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";

// This test case courtesy of Adrien Foulon
// It depends on the below use of '+' being recognized as invalid UTF-7
$css = 'input[type="radio"]:checked + img {
border: 5px solid #0083ca;
}';
echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";

echo "== DETECT ORDER ==\n";

mb_detect_order('auto');
Expand All @@ -88,6 +95,17 @@ print("EUC-JP: " . mb_detect_encoding($euc_jp) . "\n");

print("SJIS: " . mb_detect_encoding($sjis) . "\n");

// Thanks to Ulrik Nielsen for the following tests; the hex strings are the same file, but in two
// different text encodings
// We do not have any strong hints showing that the second one is actually UTF-8...
// but mb_detect_encoding still guesses UTF-8 because it is the first one in the list

$win1252text = hex2bin("2320546869732066696c6520636f6e7461696e732057696e646f77732d3132353220656e636f646564206461746120616e642048544d4c20656e7469746965730a61626364650ae6f8e50af00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790af00ad00a2623383739313b0a262331373238373b0a262333383937393b0a2623353437333b0a616263646520e6f8e520f020d0203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e202623383739313b20262331373238373b20262333383937393b202623353437333b0a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320a8c0a890a2320506f6c69736820737472696e670a50727a656a6426233337383b20646f2070727a65676c26233236313b64750a");
echo mb_detect_encoding($win1252text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n";

$utf8text = hex2bin("2320546869732066696c6520636f6e7461696e73205554462d3820656e636f64656420646174610a61626364650ac3a6c3b8c3a50ac3b00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790ac3b00ac3900ae289970ae48e870ae9a1830ae195a10a616263646520c3a6c3b8c3a520c3b020c390203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e20e2899720e48e8720e9a18320e195a10a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320ac5920ae280b00a2320506f6c69736820737472696e670a50727a656a64c5ba20646f2070727a65676cc48564750a");
echo mb_detect_encoding($utf8text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n";

echo "== INVALID PARAMETER ==\n";

print("INT: " . mb_detect_encoding(1234, 'EUC-JP') . "\n"); // EUC-JP
Expand Down Expand Up @@ -389,10 +407,13 @@ UTF-8
UTF-8
UTF-8
SJIS
UTF-8
== DETECT ORDER ==
JIS: JIS
EUC-JP: EUC-JP
SJIS: SJIS
Windows-1252
UTF-8
== INVALID PARAMETER ==
INT: EUC-JP
EUC-JP: EUC-JP
Expand Down