Skip to content

Commit 0c0774f

Browse files
committed
Use fast text conversion filters for mb_strpos, mb_stripos, mb_substr, etc
This boosts the performance of mb_strpos, mb_stripos, mb_strrpos, mb_strripos, mb_strstr, mb_stristr, mb_strrchr, and mb_strrichr when used on non-UTF-8 strings. mb_substr is also faster.

With UTF-8 input, there is no appreciable difference in performance for mb_strpos, mb_stripos, mb_strrpos, etc. This is expected, since the only real difference here (aside from shorter and simpler code) is that the new text conversion code is used when converting non-UTF-8 input strings to UTF-8. (This is done because internally, mb_strpos, etc. work only on UTF-8 text.)

For ASCII input, speed is boosted by 30-65%. For other legacy text encodings, the degree of performance improvement will depend on how slow the legacy conversion code was.

One other minor, but notable difference is that strings encoded using UTF-8 variants from Japanese mobile vendors (SoftBank, KDDI, Docomo) will not undergo encoding conversion but will be processed "as is". It is expected that this will result in a large performance boost for such input strings; but realistically, the number of users who work with such strings is probably very small.

I was not originally planning to include mb_substr in this commit, but fuzzing of the reimplemented mb_strstr revealed that mb_substr needed to be reimplemented, too. Using the old mbfl_substr, which was based on the old text conversion filters, in combination with functions which use the new text conversion filters caused bugs.

The performance boost for mb_substr varies from 10%-500%, depending on the encoding and input string used.
1 parent b96b88b commit 0c0774f

File tree

8 files changed

+265
-555
lines changed

8 files changed

+265
-555
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 0 additions & 336 deletions
Original file line numberDiff line numberDiff line change
@@ -429,42 +429,6 @@ const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_enco
429429
return enc;
430430
}
431431

432-
/*
433-
* strlen
434-
*/
435-
size_t mbfl_strlen(const mbfl_string *string)
436-
{
437-
size_t len = 0;
438-
const mbfl_encoding *encoding = string->encoding;
439-
440-
if (encoding->flag & MBFL_ENCTYPE_SBCS) {
441-
len = string->len;
442-
} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
443-
len = string->len/2;
444-
} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
445-
len = string->len/4;
446-
} else if (encoding->mblen_table) {
447-
const unsigned char *mbtab = encoding->mblen_table;
448-
unsigned char *p = string->val, *e = p + string->len;
449-
while (p < e) {
450-
p += mbtab[*p];
451-
len++;
452-
}
453-
} else {
454-
uint32_t wchar_buf[128];
455-
unsigned char *in = string->val;
456-
size_t in_len = string->len;
457-
unsigned int state = 0;
458-
459-
while (in_len) {
460-
len += encoding->to_wchar(&in, &in_len, wchar_buf, 128, &state);
461-
}
462-
}
463-
464-
return len;
465-
}
466-
467-
468432
/*
469433
* strpos
470434
*/
@@ -528,136 +492,6 @@ collector_strpos(int c, void* data)
528492
return 0;
529493
}
530494

531-
static const unsigned char *mbfl_find_offset_utf8(
532-
const unsigned char *str, const unsigned char *end, ssize_t offset) {
533-
if (offset < 0) {
534-
const unsigned char *pos = end;
535-
while (offset < 0) {
536-
if (pos <= str) {
537-
return NULL;
538-
}
539-
540-
unsigned char c = *(--pos);
541-
if (c < 0x80) {
542-
++offset;
543-
} else if ((c & 0xc0) != 0x80) {
544-
++offset;
545-
}
546-
}
547-
return pos;
548-
} else {
549-
const unsigned char *u8_tbl = mbfl_encoding_utf8.mblen_table;
550-
const unsigned char *pos = str;
551-
while (offset-- > 0) {
552-
if (pos >= end) {
553-
return NULL;
554-
}
555-
pos += u8_tbl[*pos];
556-
}
557-
return pos;
558-
}
559-
}
560-
561-
static size_t mbfl_pointer_to_offset_utf8(const unsigned char *start, const unsigned char *pos) {
562-
size_t result = 0;
563-
while (pos > start) {
564-
unsigned char c = *--pos;
565-
if (c < 0x80) {
566-
++result;
567-
} else if ((c & 0xc0) != 0x80) {
568-
++result;
569-
}
570-
}
571-
return result;
572-
}
573-
574-
size_t
575-
mbfl_strpos(
576-
mbfl_string *haystack,
577-
mbfl_string *needle,
578-
ssize_t offset,
579-
int reverse)
580-
{
581-
size_t result;
582-
mbfl_string _haystack_u8, _needle_u8;
583-
const mbfl_string *haystack_u8, *needle_u8 = NULL;
584-
const unsigned char *offset_pointer;
585-
586-
if (haystack->encoding->no_encoding != mbfl_no_encoding_utf8) {
587-
mbfl_string_init(&_haystack_u8);
588-
haystack_u8 = mbfl_convert_encoding(haystack, &_haystack_u8, &mbfl_encoding_utf8);
589-
if (haystack_u8 == NULL) {
590-
result = MBFL_ERROR_ENCODING;
591-
goto out;
592-
}
593-
} else {
594-
haystack_u8 = haystack;
595-
}
596-
597-
if (needle->encoding->no_encoding != mbfl_no_encoding_utf8) {
598-
mbfl_string_init(&_needle_u8);
599-
needle_u8 = mbfl_convert_encoding(needle, &_needle_u8, &mbfl_encoding_utf8);
600-
if (needle_u8 == NULL) {
601-
result = MBFL_ERROR_ENCODING;
602-
goto out;
603-
}
604-
} else {
605-
needle_u8 = needle;
606-
}
607-
608-
offset_pointer = mbfl_find_offset_utf8(
609-
haystack_u8->val, haystack_u8->val + haystack_u8->len, offset);
610-
if (!offset_pointer) {
611-
result = MBFL_ERROR_OFFSET;
612-
goto out;
613-
}
614-
615-
result = MBFL_ERROR_NOT_FOUND;
616-
if (haystack_u8->len < needle_u8->len) {
617-
goto out;
618-
}
619-
620-
const char *found_pos;
621-
if (!reverse) {
622-
found_pos = zend_memnstr(
623-
(const char *) offset_pointer,
624-
(const char *) needle_u8->val, needle_u8->len,
625-
(const char *) haystack_u8->val + haystack_u8->len);
626-
} else {
627-
if (offset >= 0) {
628-
found_pos = zend_memnrstr(
629-
(const char *) offset_pointer,
630-
(const char *) needle_u8->val, needle_u8->len,
631-
(const char *) haystack_u8->val + haystack_u8->len);
632-
} else {
633-
size_t needle_len = mbfl_strlen(needle_u8);
634-
offset_pointer = mbfl_find_offset_utf8(
635-
offset_pointer, haystack_u8->val + haystack_u8->len, needle_len);
636-
if (!offset_pointer) {
637-
offset_pointer = haystack_u8->val + haystack_u8->len;
638-
}
639-
640-
found_pos = zend_memnrstr(
641-
(const char *) haystack_u8->val,
642-
(const char *) needle_u8->val, needle_u8->len,
643-
(const char *) offset_pointer);
644-
}
645-
}
646-
647-
if (found_pos) {
648-
result = mbfl_pointer_to_offset_utf8(haystack_u8->val, (const unsigned char *) found_pos);
649-
}
650-
651-
out:
652-
if (haystack_u8 == &_haystack_u8) {
653-
mbfl_string_clear(&_haystack_u8);
654-
}
655-
if (needle_u8 == &_needle_u8) {
656-
mbfl_string_clear(&_needle_u8);
657-
}
658-
return result;
659-
}
660-
661495
/*
662496
* substr_count
663497
*/
@@ -727,176 +561,6 @@ mbfl_substr_count(
727561
return result;
728562
}
729563

730-
/*
731-
* substr
732-
*/
733-
struct collector_substr_data {
734-
mbfl_convert_filter *next_filter;
735-
size_t start;
736-
size_t stop;
737-
size_t output;
738-
};
739-
740-
static int
741-
collector_substr(int c, void* data)
742-
{
743-
struct collector_substr_data *pc = (struct collector_substr_data*)data;
744-
745-
if (pc->output >= pc->stop) {
746-
return -1;
747-
}
748-
749-
if (pc->output >= pc->start) {
750-
(*pc->next_filter->filter_function)(c, pc->next_filter);
751-
}
752-
753-
pc->output++;
754-
755-
return 0;
756-
}
757-
758-
mbfl_string *
759-
mbfl_substr(
760-
mbfl_string *string,
761-
mbfl_string *result,
762-
size_t from,
763-
size_t length)
764-
{
765-
const mbfl_encoding *encoding = string->encoding;
766-
size_t n, k, len, start, end;
767-
unsigned m;
768-
unsigned char *p, *w;
769-
770-
mbfl_string_init(result);
771-
result->encoding = string->encoding;
772-
773-
if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) ||
774-
encoding->mblen_table != NULL) {
775-
len = string->len;
776-
if (encoding->flag & MBFL_ENCTYPE_SBCS) {
777-
start = from;
778-
} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
779-
start = from*2;
780-
} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
781-
start = from*4;
782-
} else {
783-
const unsigned char *mbtab = encoding->mblen_table;
784-
start = 0;
785-
n = 0;
786-
k = 0;
787-
p = string->val;
788-
/* search start position */
789-
while (k <= from) {
790-
start = n;
791-
if (n >= len) {
792-
break;
793-
}
794-
m = mbtab[*p];
795-
n += m;
796-
p += m;
797-
k++;
798-
}
799-
}
800-
801-
if (length == MBFL_SUBSTR_UNTIL_END) {
802-
end = len;
803-
} else if (encoding->flag & MBFL_ENCTYPE_SBCS) {
804-
end = start + length;
805-
} else if (encoding->flag & MBFL_ENCTYPE_WCS2) {
806-
end = start + length*2;
807-
} else if (encoding->flag & MBFL_ENCTYPE_WCS4) {
808-
end = start + length*4;
809-
} else {
810-
const unsigned char *mbtab = encoding->mblen_table;
811-
end = start;
812-
n = start;
813-
k = 0;
814-
p = string->val + start;
815-
/* detect end position */
816-
while (k <= length) {
817-
end = n;
818-
if (n >= len) {
819-
break;
820-
}
821-
m = mbtab[*p];
822-
n += m;
823-
p += m;
824-
k++;
825-
}
826-
}
827-
828-
if (start > len) {
829-
start = len;
830-
}
831-
if (end > len) {
832-
end = len;
833-
}
834-
if (start > end) {
835-
start = end;
836-
}
837-
838-
/* allocate memory and copy */
839-
n = end - start;
840-
result->len = 0;
841-
result->val = w = (unsigned char*)emalloc(n + 1);
842-
result->len = n;
843-
memcpy(w, string->val + start, n);
844-
w[n] = '\0';
845-
} else {
846-
mbfl_memory_device device;
847-
struct collector_substr_data pc;
848-
mbfl_convert_filter *decoder;
849-
mbfl_convert_filter *encoder;
850-
851-
if (length == MBFL_SUBSTR_UNTIL_END) {
852-
length = mbfl_strlen(string) - from;
853-
}
854-
855-
mbfl_memory_device_init(&device, length + 1, 0);
856-
mbfl_string_init(result);
857-
result->encoding = string->encoding;
858-
/* output code filter */
859-
decoder = mbfl_convert_filter_new(
860-
&mbfl_encoding_wchar,
861-
string->encoding,
862-
mbfl_memory_device_output, 0, &device);
863-
/* wchar filter */
864-
encoder = mbfl_convert_filter_new(
865-
string->encoding,
866-
&mbfl_encoding_wchar,
867-
collector_substr, 0, &pc);
868-
if (decoder == NULL || encoder == NULL) {
869-
mbfl_convert_filter_delete(encoder);
870-
mbfl_convert_filter_delete(decoder);
871-
return NULL;
872-
}
873-
pc.next_filter = decoder;
874-
pc.start = from;
875-
pc.stop = from + length;
876-
pc.output = 0;
877-
878-
/* feed data */
879-
p = string->val;
880-
n = string->len;
881-
if (p != NULL) {
882-
while (n > 0) {
883-
if ((*encoder->filter_function)(*p++, encoder) < 0) {
884-
break;
885-
}
886-
n--;
887-
}
888-
}
889-
890-
mbfl_convert_filter_flush(encoder);
891-
mbfl_convert_filter_flush(decoder);
892-
result = mbfl_memory_device_result(&device, result);
893-
mbfl_convert_filter_delete(encoder);
894-
mbfl_convert_filter_delete(decoder);
895-
}
896-
897-
return result;
898-
}
899-
900564
/*
901565
* strcut
902566
*/

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -190,24 +190,11 @@ static inline int mbfl_is_error(size_t len) {
190190
return len >= (size_t) -16;
191191
}
192192

193-
/*
194-
* strlen
195-
*/
196-
MBFLAPI extern size_t
197-
mbfl_strlen(const mbfl_string *string);
198-
199193
#define MBFL_ERROR_NOT_FOUND ((size_t) -1)
200194
#define MBFL_ERROR_ENCODING ((size_t) -4)
201195
#define MBFL_ERROR_EMPTY ((size_t) -8)
202196
#define MBFL_ERROR_OFFSET ((size_t) -16)
203197

204-
/*
205-
* strpos.
206-
* Errors: MBFL_ERROR_NOT_FOUND, MBFL_ERROR_ENCODING, MBFL_ERROR_OFFSET
207-
*/
208-
MBFLAPI extern size_t
209-
mbfl_strpos(mbfl_string *haystack, mbfl_string *needle, ssize_t offset, int reverse);
210-
211198
/*
212199
* substr_count
213200
*/
@@ -219,12 +206,6 @@ mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle);
219206
*/
220207
#define MBFL_SUBSTR_UNTIL_END ((size_t) -1)
221208

222-
/*
223-
* substr
224-
*/
225-
MBFLAPI extern mbfl_string *
226-
mbfl_substr(mbfl_string *string, mbfl_string *result, size_t from, size_t length);
227-
228209
/*
229210
* strcut
230211
*/

0 commit comments

Comments
 (0)