-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Don't write 0 to the end of strings in embind #10844
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,7 @@ var UTF16Decoder = typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-16l | |
| #endif // TEXTDECODER | ||
| #endif // TEXTDECODER == 2 | ||
|
|
||
| function UTF16ToString(ptr) { | ||
| function UTF16ToString(ptr, maxBytesToRead) { | ||
| #if ASSERTIONS | ||
| assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!'); | ||
| #endif | ||
|
|
@@ -48,7 +48,10 @@ function UTF16ToString(ptr) { | |
| // TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself. | ||
| // Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage. | ||
| var idx = endPtr >> 1; | ||
| while (HEAP16[idx]) ++idx; | ||
| var maxIdx = idx + maxBytesToRead / 2; | ||
| // If maxBytesToRead is not passed explicitly, it will be undefined, and this | ||
| // will always evaluate to true. This saves on code size. | ||
| while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx; | ||
| endPtr = idx << 1; | ||
|
|
||
| #if TEXTDECODER != 2 | ||
|
|
@@ -64,7 +67,7 @@ function UTF16ToString(ptr) { | |
| var str = ''; | ||
| while (1) { | ||
| var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}}; | ||
| if (codeUnit == 0) return str; | ||
| if (codeUnit == 0 || i == maxBytesToRead / 2) return str; | ||
| ++i; | ||
| // fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through. | ||
| str += String.fromCharCode(codeUnit); | ||
|
|
@@ -117,16 +120,18 @@ function lengthBytesUTF16(str) { | |
| return str.length*2; | ||
| } | ||
|
|
||
| function UTF32ToString(ptr) { | ||
| function UTF32ToString(ptr, maxBytesToRead) { | ||
| #if ASSERTIONS | ||
| assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!'); | ||
| #endif | ||
| var i = 0; | ||
|
|
||
| var str = ''; | ||
| while (1) { | ||
| // If maxBytesToRead is not passed explicitly, it will be undefined, and this | ||
| // will always evaluate to true. This saves on code size. | ||
| while (!(i >= maxBytesToRead / 4)) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Likewise here, having a per-loop iteration division is not good for perf. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is safe to assume the JS VM will do loop invariant code motion on such things, it's been standard for many years since the JS speed wars. And it can be more compact this way instead of defining a new variable (unless we also need that variable elsewhere). |
||
| var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}}; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These makeGetValues could be demoted to direct HEAPU32/HEAPU16 reads for better performance and smaller code size, although could leave that for a later day as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The compiler turns those
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think it is able to optimize away the *4, so it will end up with a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a good question, yeah, maybe not. Sounds good to iterate over |
||
| if (utf32 == 0) return str; | ||
| if (utf32 == 0) break; | ||
| ++i; | ||
| // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing. | ||
| // See http://unicode.org/faq/utf_bom.html#utf16-3 | ||
|
|
@@ -137,6 +142,7 @@ function UTF32ToString(ptr) { | |
| str += String.fromCharCode(utf32); | ||
| } | ||
| } | ||
| return str; | ||
| } | ||
|
|
||
| // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr', | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this not be
i == maxIdx? Having a division in the hot per-characterr loop would be bad for performance.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also note the inconsistency in implementation between UTF16ToString() and UTF32ToString: this retains a
while(1)whereas UTF32ToString switched to awhile (!(i >= maxBytesToRead / 4)) {. This variant is better for code size and less branches (termination test is preserved in one if, whereas it is in two parts in UTF32ToString), let's use UTF16ToString() format in both?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 for a followup to make the code more consistent.