Skip to content

Commit 898519a

Browse files
committed
mb_decode_numericentity converts entities which immediately follow a valid/invalid entity
Thanks to Kamil Tieleka for suggesting that some of the behaviors of the legacy implementation which the new mb_decode_numericentity implementation took care to maintain were actually bugs and should be fixed. Thanks also to Trevor Rowbotham for providing a link to the HTML specification, showing how HTML numeric entities should be interpreted. mb_decode_numericentity now processes numeric entities in the following situations where the old implementation would not: - &<ENTITY> (for example, &&php#65;) - &#<ENTITY> - &#x<ENTITY> - <VALID BUT UNTERMINATED DECIMAL ENTITY><ENTITY> (for example, &php#65&php#65;) - <VALID BUT UNTERMINATED HEX ENTITY><ENTITY> - <INVALID AND UNTERMINATED DECIMAL ENTITY><ENTITY> (it does not matter why the first entity is invalid; the value could be too big, it could have too many digits, or it could not match the 'convmap' parameter) - <INVALID AND UNTERMINATED HEX ENTITY><ENTITY> This is consistent with the way that web browsers process HTML entities.
1 parent cd0ea64 commit 898519a

File tree

2 files changed

+30
-63
lines changed

2 files changed

+30
-63
lines changed

ext/mbstring/mbstring.c

Lines changed: 8 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3408,11 +3408,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34083408
/* Invalid entity */
34093409
memcpy(converted, p, (p2 - p) * 4);
34103410
converted += p2 - p;
3411-
if ((p2 - p) == 3) {
3412-
/* For compatibility with legacy behavior; if &#x is followed
3413-
* by another entity, we don't convert the 2nd entity */
3414-
*converted++ = *p2++;
3415-
}
34163411
} else {
34173412
/* Valid hexadecimal entity */
34183413
uint32_t value = 0;
@@ -3430,16 +3425,12 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34303425
uint32_t original;
34313426
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
34323427
*converted++ = original;
3433-
if (*p2 != ';')
3434-
*converted++ = *p2;
3428+
if (*p2 == ';')
3429+
p2++;
34353430
} else {
3436-
memcpy(converted, p, (p2 - p + 1) * 4);
3437-
converted += p2 - p + 1;
3431+
memcpy(converted, p, (p2 - p) * 4);
3432+
converted += p2 - p;
34383433
}
3439-
/* For compatibility with legacy behavior:
3440-
* We don't consider & as starting a new entity if it
3441-
* terminates a preceding entity */
3442-
p2++;
34433434
}
34443435
} else if (p2 == wchar_buf + out_len && in_len) {
34453436
wchar_buf[0] = '&';
@@ -3469,11 +3460,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34693460
/* Invalid entity */
34703461
memcpy(converted, p, (p2 - p) * 4);
34713462
converted += p2 - p;
3472-
if ((p2 - p) == 2) {
3473-
/* For compatibility with legacy behavior; if &# is followed
3474-
* by another entity, we don't convert the 2nd entity */
3475-
*converted++ = *p2++;
3476-
}
34773463
} else {
34783464
/* Valid decimal entity */
34793465
uint32_t value = 0;
@@ -3492,16 +3478,12 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
34923478
uint32_t original;
34933479
if (html_numeric_entity_deconvert(value, convmap, mapsize, &original)) {
34943480
*converted++ = original;
3495-
if (*p2 != ';')
3496-
*converted++ = *p2;
3481+
if (*p2 == ';')
3482+
p2++;
34973483
} else {
3498-
memcpy(converted, p, (p2 - p + 1) * 4);
3499-
converted += p2 - p + 1;
3484+
memcpy(converted, p, (p2 - p) * 4);
3485+
converted += p2 - p;
35003486
}
3501-
/* For compatibility with legacy behavior:
3502-
* We don't consider & as starting a new entity if it
3503-
* terminates a preceding entity */
3504-
p2++;
35053487
}
35063488
}
35073489
} else if (p2 == wchar_buf + out_len) {
@@ -3516,17 +3498,6 @@ static zend_string* html_numeric_entity_decode(zend_string *input, const mbfl_en
35163498
}
35173499
} else {
35183500
*converted++ = '&';
3519-
if (*p2 == '&') {
3520-
/* Two successive ampersands convert to a single one,
3521-
* and we do NOT allow the second one to start an entity
3522-
* Duplicating the legacy behavior of mb_decode_numericentity here */
3523-
*converted++ = '&';
3524-
p2++;
3525-
if (p2 == wchar_buf + out_len) {
3526-
/* Corner case: two &'s at end of buffer */
3527-
goto process_converted_wchars_no_offset;
3528-
}
3529-
}
35303501
}
35313502
decimal_entity_too_big:
35323503

ext/mbstring/tests/mb_decode_numericentity.phpt

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -93,33 +93,28 @@ test("More than 8 digits for hex entity", "&#x000000141;", "&#x000000141;", [0,
9393

9494
test("Single &", "&", "&", [0, 0xFFFF, 0, 0xFFFF], "ASCII");
9595

96-
// We don't allow an entity to come right after a preceding ampersand
97-
// (This is for compatibility with the legacy behavior of mb_decode_numericentity)
98-
test("Successive &", "&&#2,", "&&#2,", [0xffe9ade7, 0x6237b6ff, 0xaa597469, 0x612800], 'ASCII');
96+
// An entity can come right after a preceding ampersand
97+
test("Successive &", "&&#65,", "&A,", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
9998

100-
// We don't allow an entity to come right after a preceding &#
101-
// (Also for compatibility with the legacy behavior of mb_decode_numericentity)
102-
test("Successive &#", "&#&#x32;", "&#&#x32;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
103-
test("Successive &#x", "&#x&#x32;", "&#x&#x32;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
99+
// An entity can come right after a preceding &#
100+
test("Successive &#", "&#&#x32;", "&#2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
101+
test("Successive &#x", "&#x&#x32;", "&#x2", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
104102

105-
// Don't allow the starting & of an entity to terminate a preceding entity
106-
// (Also for compatibility with the legacy behavior of mb_decode_numericentity)
107-
test("Successive &#65", "&#65&#65;", "A&#65;", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
103+
// The starting & of an entity can terminate a preceding entity
104+
test("Successive &#65", "&#65&#65;", "AA", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
105+
test("Successive hex entities", "&#x32&#x32;", "22", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
108106

109-
// An entity CAN come right after an entity which is invalid because of being too long
107+
// An entity can come right after an entity which is invalid because of being too long
110108
test("Starting entity immediately after decimal entity which is too long", "&#10000000000&#65;", "&#10000000000A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
111109
test("Starting entity immediately after hex entity which is too long", "&#x111111111&#65;", "&#x111111111A", [0, 0xFFFF, 0, 0xFFFF], 'ASCII');
112110

113-
// The second entity is not accepted here, because it terminates a preceding entity
114-
// (To test entities which are as large as possible without being too large, we need to use UCS-4;
115-
// any other encoding would not allow codepoints that large)
116111
$ucs4_test1 = mb_convert_encoding("&#1000000000&#65;", 'UCS-4BE', 'ASCII');
117-
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00&\x00\x00\x00#\x00\x00\x006\x00\x00\x005\x00\x00\x00;", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
112+
testNonAscii("Starting entity immediately after valid decimal entity which is just within maximum length", $ucs4_test1, "\x3B\x9A\xCA\x00\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
118113
$ucs4_test2 = mb_convert_encoding("&#x11111111&#65;", 'UCS-4BE', 'ASCII');
119-
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00&\x00\x00\x00#\x00\x00\x006\x00\x00\x005\x00\x00\x00;", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
114+
testNonAscii("Starting entity immediately after valid hex entity which is just within maximum length", $ucs4_test2, "\x11\x11\x11\x11\x00\x00\x00A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'UCS-4BE');
120115

121-
test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0&#65;", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
122-
test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0&#65;", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
116+
test("Starting entity immediately after invalid decimal entity", "&#0&#65;", "&#0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
117+
test("Starting entity immediately after invalid hex entity", "&#x0&#65;", "&#x0A", [0x1, 0xFFFF, 0, 0xFFFF], 'ASCII');
123118

124119
test("Starting entity immediately after too-big decimal entity", "&#7001492542&#65;", "&#7001492542A", [0, 0xFFFFFFFF, 0, 0xFFFFFFFF], 'ASCII');
125120

@@ -166,16 +161,17 @@ More than 10 digits for decimal entity: string(14) "&#00000000165;" => string(14
166161
8 digits for hex entity: string(12) "&#x00000041;" => string(1) "A" (Good)
167162
More than 8 digits for hex entity: string(13) "&#x000000141;" => string(13) "&#x000000141;" (Good)
168163
Single &: string(1) "&" => string(1) "&" (Good)
169-
Successive &: string(5) "&&#2," => string(5) "&&#2," (Good)
170-
Successive &#: string(8) "&#&#x32;" => string(8) "&#&#x32;" (Good)
171-
Successive &#x: string(9) "&#x&#x32;" => string(9) "&#x&#x32;" (Good)
172-
Successive &#65: string(9) "&#65&#65;" => string(6) "A&#65;" (Good)
164+
Successive &: string(6) "&&#65," => string(3) "&A," (Good)
165+
Successive &#: string(8) "&#&#x32;" => string(3) "&#2" (Good)
166+
Successive &#x: string(9) "&#x&#x32;" => string(4) "&#x2" (Good)
167+
Successive &#65: string(9) "&#65&#65;" => string(2) "AA" (Good)
168+
Successive hex entities: string(11) "&#x32&#x32;" => string(2) "22" (Good)
173169
Starting entity immediately after decimal entity which is too long: string(18) "&#10000000000&#65;" => string(14) "&#10000000000A" (Good)
174170
Starting entity immediately after hex entity which is too long: string(17) "&#x111111111&#65;" => string(13) "&#x111111111A" (Good)
175-
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca00000000260000002300000036000000350000003b (Good)
176-
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 11111111000000260000002300000036000000350000003b (Good)
177-
Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(8) "&#0&#65;" (Good)
178-
Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(9) "&#x0&#65;" (Good)
171+
Starting entity immediately after valid decimal entity which is just within maximum length: 000000260000002300000031000000300000003000000030000000300000003000000030000000300000003000000030000000260000002300000036000000350000003b => 3b9aca0000000041 (Good)
172+
Starting entity immediately after valid hex entity which is just within maximum length: 0000002600000023000000780000003100000031000000310000003100000031000000310000003100000031000000260000002300000036000000350000003b => 1111111100000041 (Good)
173+
Starting entity immediately after invalid decimal entity: string(8) "&#0&#65;" => string(4) "&#0A" (Good)
174+
Starting entity immediately after invalid hex entity: string(9) "&#x0&#65;" => string(5) "&#x0A" (Good)
179175
Starting entity immediately after too-big decimal entity: string(17) "&#7001492542&#65;" => string(13) "&#7001492542A" (Good)
180176
Regression test (entity which decodes to 0xFFFFFFFF): string(5) "&#xe;" => string(1) "?" (Good)
181177
Regression test (truncation of successive & with JIS encoding): string(3) "&&&" => string(3) "&&&" (Good)

0 commit comments

Comments
 (0)