author | Jukka Jylänki <jujjyl@gmail.com> | 2013-09-01 18:45:05 +0300
---|---|---
committer | Alon Zakai <alonzakai@gmail.com> | 2013-09-10 14:39:31 -0700
commit | daebf68a22609cdce86f8dca699a0a945d56c9ab |
tree | ac04e752557013205f780389cddc297b3eb77a1b |
parent | 998bfbaf23a3ac894b7f57e237af529b08a5a2c3 |
Fix UTF32 <-> JS string conversion for non-BMP characters. Turns out the JS string functions .length, .charCodeAt and .fromCharCode do not operate on Unicode code points, but on UTF-16 code units.
-rw-r--r-- | src/preamble.js | 27 |
1 file changed, 21 insertions, 6 deletions
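The commit message's claim is easy to check in any JS console. A minimal illustration (not part of the patch), using the non-BMP code point U+1F600 (😀), which UTF-16 stores as the surrogate pair 0xD83D 0xDE00:

```js
var s = '\uD83D\uDE00';       // one code point, two UTF-16 code units
console.log(s.length);        // 2 -- counts code units, not characters
console.log(s.charCodeAt(0)); // 55357 (0xD83D, the lead surrogate)
console.log(s.charCodeAt(1)); // 56832 (0xDE00, the trail surrogate)
// fromCharCode likewise takes UTF-16 code units, not code points:
console.log(String.fromCharCode(0xD83D, 0xDE00) === s); // true
```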
```diff
diff --git a/src/preamble.js b/src/preamble.js
index cbd50ade..3cf74b9d 100644
--- a/src/preamble.js
+++ b/src/preamble.js
@@ -580,19 +580,34 @@ function UTF32ToString(ptr) {
     if (utf32 == 0)
       return str;
     ++i;
-    str += String.fromCharCode(utf32);
+    // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
+    if (utf32 >= 0x10000) {
+      var ch = utf32 - 0x10000;
+      str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+    } else {
+      str += String.fromCharCode(utf32);
+    }
   }
 }
 Module['UTF32ToString'] = UTF32ToString;
 
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
-// null-terminated and encoded in UTF32LE form. The copy will require (str.length+1)*4 bytes of space in the HEAP.
+// null-terminated and encoded in UTF32LE form. The copy will require at most (str.length+1)*4 bytes of space in the HEAP,
+// but can use less, since str.length does not return the number of characters in the string, but the number of UTF-16 code units in the string.
 function stringToUTF32(str, outPtr) {
-  for(var i = 0; i < str.length; ++i) {
-    var utf32 = str.charCodeAt(i);
-    {{{ makeSetValue('outPtr', 'i*4', 'utf32', 'i32') }}}
+  var iChar = 0;
+  for(var iCodeUnit = 0; iCodeUnit < str.length; ++iCodeUnit) {
+    // Gotcha: charCodeAt returns a 16-bit word that is a UTF-16 encoded code unit, not a Unicode code point of the character! We must decode the string to UTF-32 to the heap.
+    var codeUnit = str.charCodeAt(iCodeUnit); // possibly a lead surrogate
+    if (codeUnit >= 0xD800) {
+      var trailSurrogate = str.charCodeAt(++iCodeUnit);
+      codeUnit = 0x10000 + ((codeUnit & 0x3FF) << 10) | (trailSurrogate & 0x3FF);
+    }
+    {{{ makeSetValue('outPtr', 'iChar*4', 'codeUnit', 'i32') }}}
+    ++iChar;
   }
-  {{{ makeSetValue('outPtr', 'str.length*4', 0, 'i32') }}}
+  // Null-terminate the pointer to the HEAP.
+  {{{ makeSetValue('outPtr', 'iChar*4', 0, 'i32') }}}
 }
 Module['stringToUTF32'] = stringToUTF32;
```
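For reference, here is the patch's surrogate-pair math as a standalone sketch (an illustration with hypothetical helper names, not code from the repository). It uses a stricter lead-surrogate test than the patch's `codeUnit >= 0xD800`: only 0xD800-0xDBFF are lead surrogates, while code units in 0xE000-0xFFFF are ordinary BMP characters and must not be paired.

```js
// Encode one Unicode code point into UTF-16 code units
// (the same math UTF32ToString applies per heap word).
function codePointToUTF16(cp) {
  if (cp >= 0x10000) {
    var ch = cp - 0x10000;
    return [0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF)];
  }
  return [cp];
}

// Decode UTF-16 code units back into a code point
// (the same math stringToUTF32 applies per string index).
function utf16ToCodePoint(units, i) {
  var u = units[i];
  if (u >= 0xD800 && u <= 0xDBFF) { // pair up only on a real lead surrogate
    return 0x10000 + ((u & 0x3FF) << 10) + (units[i + 1] & 0x3FF);
  }
  return u;
}

// Round trip for U+1F600:
var pair = codePointToUTF16(0x1F600);                 // [0xD83D, 0xDE00]
console.log(utf16ToCodePoint(pair, 0).toString(16));  // "1f600"
```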