Skip to content

Commit c6a7549

Browse files
author
Rui Hirokawa
committed
fixed #65045: mb_convert_encoding breaks well-formed character.
1 parent 4d606cf commit c6a7549

File tree

3 files changed

+177
-193
lines changed

3 files changed

+177
-193
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf8.c

Lines changed: 94 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
7979
mbfl_filt_conv_common_ctor,
8080
mbfl_filt_conv_common_dtor,
8181
mbfl_filt_conv_utf8_wchar,
82-
mbfl_filt_conv_common_flush
82+
mbfl_filt_conv_utf8_wchar_flush
8383
};
8484

8585
const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
@@ -93,118 +93,122 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
9393

9494
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
9595

96+
/*
 * Emit a value that could not be decoded as a pass-through code.
 *
 * The value is masked with MBFL_WCSGROUP_MASK, tagged with
 * MBFL_WCSGROUP_THROUGH and handed to filter->output_function.
 * filter->status and filter->cache are cleared before the output call,
 * so the filter is back in its initial state even if the call fails.
 *
 * Returns -1 when the output function returns a negative value (via CK),
 * otherwise 0.
 */
int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter)
{
	int w;

	w = c & MBFL_WCSGROUP_MASK;
	w |= MBFL_WCSGROUP_THROUGH;
	filter->status = 0;
	filter->cache = 0;
	CK((*filter->output_function)(w, filter->data));

	/* Declared int: without this the function fell off the end and the
	 * value seen by a caller was undefined. */
	return 0;
}
105+
106+
96107
/*
97108
* UTF-8 => wchar
98109
*/
99110
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
100111
{
101112
int s, c1, w = 0, flag = 0;
102113

103-
if (c < 0x80) {
104-
if (filter->status != 0) {
105-
w = (filter->cache & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;
106-
CK((*filter->output_function)(w, filter->data));
107-
filter->status = 0;
108-
filter->cache = 0;
109-
}
110-
if (c >= 0) {
114+
retry:
115+
switch (filter->status & 0xff) {
116+
case 0x00:
117+
if (c < 0x80) {
111118
CK((*filter->output_function)(c, filter->data));
119+
} else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */
120+
filter->status = 0x10;
121+
filter->cache = c & 0x1f;
122+
} else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */
123+
filter->status = 0x20;
124+
filter->cache = c & 0xf;
125+
} else if (c >= 0xf0 && c <= 0xf4) { /* 4byte code first char: 0xf0-0xf4 */
126+
filter->status = 0x30;
127+
filter->cache = c & 0x7;
128+
} else {
129+
mbfl_filt_put_invalid_char(c, filter);
112130
}
113-
} else if (c < 0xc0) {
114-
int status = filter->status & 0xff;
115-
switch (status) {
116-
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
117-
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
118-
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
119-
filter->status = 0;
120-
s = filter->cache | (c & 0x3f);
131+
break;
132+
case 0x10: /* 2byte code 2nd char: 0x80-0xbf */
133+
case 0x21: /* 3byte code 3rd char: 0x80-0xbf */
134+
case 0x32: /* 4byte code 4th char: 0x80-0xbf */
135+
filter->status = 0;
136+
if (c >= 0x80 && c <= 0xbf) {
137+
s = (filter->cache<<6) | (c & 0x3f);
121138
filter->cache = 0;
122-
if ((status == 0x10 && s >= 0x80) ||
123-
(status == 0x21 && s >= 0x800 && (s < 0xd800 || s > 0xdfff)) ||
124-
(status == 0x32 && s >= 0x10000 && s < 0x110000)) {
125-
CK((*filter->output_function)(s, filter->data));
126-
} else {
127-
w = s & MBFL_WCSGROUP_MASK;
128-
flag = 1;
129-
}
130-
break;
131-
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
132-
s = filter->cache | ((c & 0x3f) << 6);
133-
c1 = (s >> 12) & 0xf;
134-
if ((c1 == 0x0 && c >= 0xa0) ||
135-
(c1 == 0xd && c < 0xa0) ||
136-
(c1 > 0x0 && c1 != 0xd)) {
137-
filter->cache = s;
138-
filter->status++;
139-
} else {
140-
w = s & MBFL_WCSGROUP_MASK;
141-
flag = 1;
142-
}
143-
break;
144-
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
145-
filter->cache |= ((c & 0x3f) << 6);
146-
filter->status++;
147-
break;
148-
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
149-
s = filter->cache | ((c & 0x3f) << 12);
150-
c1 = (s >> 18) & 0x7;
151-
if ((c1 == 0x0 && c >= 0x90) ||
152-
(c1 > 0x0 && c1 < 0x4) ||
153-
(c1 == 0x4 && c < 0x90)) {
154-
filter->cache = s;
155-
filter->status++;
156-
} else {
157-
w = s & MBFL_WCSGROUP_MASK;
158-
flag = 1;
159-
}
160-
break;
161-
default:
162-
w = c & MBFL_WCSGROUP_MASK;
163-
flag = 1;
164-
break;
139+
CK((*filter->output_function)(s, filter->data));
140+
} else {
141+
mbfl_filt_put_invalid_char(filter->cache, filter);
142+
goto retry;
165143
}
166-
} else if (c < 0xc2) { /* invalid: 0xc0,0xc1 */
167-
w = c & MBFL_WCSGROUP_MASK;
168-
flag = 1;
169-
} else if (c < 0xe0) { /* 2byte code first char: 0xc2-0xdf */
170-
if (filter->status == 0x0) {
171-
filter->status = 0x10;
172-
filter->cache = (c & 0x1f) << 6;
144+
break;
145+
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-0x9f,1-C,E-F:0x80-0xbf */
146+
s = (filter->cache<<6) | (c & 0x3f);
147+
c1 = filter->cache & 0xf;
148+
149+
if ((c >= 0x80 && c <= 0xbf) &&
150+
((c1 == 0x0 && c >= 0xa0) ||
151+
(c1 == 0xd && c < 0xa0) ||
152+
(c1 > 0x0 && c1 != 0xd))) {
153+
filter->cache = s;
154+
filter->status++;
173155
} else {
174-
w = c & MBFL_WCSGROUP_MASK;
175-
flag = 1;
156+
mbfl_filt_put_invalid_char(filter->cache, filter);
157+
goto retry;
176158
}
177-
} else if (c < 0xf0) { /* 3byte code first char: 0xe0-0xef */
178-
if (filter->status == 0x0) {
179-
filter->status = 0x20;
180-
filter->cache = (c & 0xf) << 12;
159+
break;
160+
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
161+
s = (filter->cache<<6) | (c & 0x3f);
162+
c1 = filter->cache & 0x7;
163+
164+
if ((c >= 0x80 && c <= 0xbf) &&
165+
((c1 == 0x0 && c >= 0x90) ||
166+
(c1 == 0x4 && c < 0x90) ||
167+
(c1 > 0x0 && c1 != 0x4))) {
168+
filter->cache = s;
169+
filter->status++;
181170
} else {
182-
w = c & MBFL_WCSGROUP_MASK;
183-
flag = 1;
171+
mbfl_filt_put_invalid_char(filter->cache, filter);
172+
goto retry;
184173
}
185-
} else if (c < 0xf5) { /* 4byte code first char: 0xf0-0xf4 */
186-
if (filter->status == 0x0) {
187-
filter->status = 0x30;
188-
filter->cache = (c & 0x7) << 18;
174+
break;
175+
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
176+
if (c >= 0x80 && c <= 0xbf) {
177+
filter->cache = (filter->cache<<6) | (c & 0x3f);
178+
filter->status++;
189179
} else {
190-
w = c & MBFL_WCSGROUP_MASK;
191-
flag = 1;
180+
mbfl_filt_put_invalid_char(filter->cache, filter);
181+
goto retry;
192182
}
193-
} else {
194-
w = c & MBFL_WCSGROUP_MASK;
195-
flag = 1;
196-
}
197-
198-
if (flag) {
199-
w |= MBFL_WCSGROUP_THROUGH;
200-
CK((*filter->output_function)(w, filter->data));
183+
break;
184+
default:
201185
filter->status = 0;
202-
filter->cache = 0;
186+
break;
203187
}
204188

205189
return c;
206190
}
207191

192+
/*
 * Flush handler for the UTF-8 => wchar filter.
 *
 * Snapshots filter->status and filter->cache, resets both to 0, and if
 * the saved status was non-zero passes the saved cache value to
 * mbfl_filt_put_invalid_char(). Afterwards the downstream
 * filter->flush_function is invoked when one is set.
 *
 * Always returns 0; the results of the two calls above are not inspected.
 */
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
{
	const int pending_status = filter->status;
	const int pending_cache = filter->cache;

	/* Reset the state first so the filter is clean before anything is emitted. */
	filter->cache = 0;
	filter->status = 0;

	if (pending_status != 0) {
		mbfl_filt_put_invalid_char(pending_cache, filter);
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
211+
208212
/*
209213
* wchar => UTF-8
210214
*/

ext/mbstring/libmbfl/filters/mbfilter_utf8.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,6 @@ extern const struct mbfl_convert_vtbl vtbl_wchar_utf8;
3737

3838
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter);
3939
int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter);
40+
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter);
4041

4142
#endif /* MBFL_MBFILTER_UTF8_H */

0 commit comments

Comments
 (0)