about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alon Zakai <alonzakai@gmail.com> 2013-08-20 13:56:22 -0700
committer Alon Zakai <alonzakai@gmail.com> 2013-08-20 13:56:22 -0700
commit 6f2c31d4c86d168967e019bc6e731f6c454de91c (patch)
tree 73f194d034c88cadf2bce70d53062dc68425e9b4
parent a0c1f8c193e4d5c318f04b886c52bc3e8ab06980 (diff)
parent 7bf034909f6d117320680200d14d0937cabdd787 (diff)
Merge pull request #1537 from aidanhs/4-byte-utf8-chars
Fix printing of 4 byte UTF-8 characters
-rw-r--r-- AUTHORS 1
-rw-r--r-- src/runtime.js 40
-rw-r--r-- tests/test_core.py 12
3 files changed, 35 insertions, 18 deletions
diff --git a/AUTHORS b/AUTHORS
index 5161f7ad..6afe0918 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -93,4 +93,5 @@ a license to everyone to use it as detailed in LICENSE.)
* Yu Kobayashi <yukoba@accelart.jp>
* Pin Zhang <zhangpin04@gmail.com>
* Nick Bray <ncbray@chromium.org> (copyright owned by Google, Inc.)
+* Aidan Hobson Sayers <aidanhs@cantab.net>
diff --git a/src/runtime.js b/src/runtime.js
index e07d5054..33088ad9 100644
--- a/src/runtime.js
+++ b/src/runtime.js
@@ -386,35 +386,51 @@ var Runtime = {
// Returns a processor of UTF.
// processCChar() receives characters from a C-like UTF representation and returns JS string fragments.
+ // See RFC3629 for details, the bytes are assumed to be valid UTF-8
// processJSString() receives a JS string and returns a C-like UTF representation in an array
UTF8Processor: function() {
var buffer = [];
var needed = 0;
this.processCChar = function (code) {
- code = code & 0xff;
- if (needed) {
- buffer.push(code);
- needed--;
- }
+ code = code & 0xFF;
+
if (buffer.length == 0) {
- if (code < 128) return String.fromCharCode(code);
+ if ((code & 0x80) == 0x00) { // 0xxxxxxx
+ return String.fromCharCode(code);
+ }
buffer.push(code);
- if (code > 191 && code < 224) {
+ if ((code & 0xE0) == 0xC0) { // 110xxxxx
needed = 1;
- } else {
+ } else if ((code & 0xF0) == 0xE0) { // 1110xxxx
needed = 2;
+ } else { // 11110xxx
+ needed = 3;
}
return '';
}
- if (needed > 0) return '';
+
+ if (needed) {
+ buffer.push(code);
+ needed--;
+ if (needed > 0) return '';
+ }
+
var c1 = buffer[0];
var c2 = buffer[1];
var c3 = buffer[2];
+ var c4 = buffer[3];
var ret;
- if (c1 > 191 && c1 < 224) {
- ret = String.fromCharCode(((c1 & 31) << 6) | (c2 & 63));
+ if (buffer.length == 2) {
+ ret = String.fromCharCode(((c1 & 0x1F) << 6) | (c2 & 0x3F));
+ } else if (buffer.length == 3) {
+ ret = String.fromCharCode(((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
} else {
- ret = String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+ var codePoint = ((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) |
+ ((c3 & 0x3F) << 6) | (c4 & 0x3F);
+ ret = String.fromCharCode(
+ Math.floor((codePoint - 0x10000) / 0x400) + 0xD800,
+ (codePoint - 0x10000) % 0x400 + 0xDC00);
}
buffer.length = 0;
return ret;
diff --git a/tests/test_core.py b/tests/test_core.py
index f3ddc4f4..29a04e1e 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -7105,14 +7105,15 @@ def process(filename):
#include <emscripten.h>
int main() {
- char *c = "μ†ℱ ╋ℯ╳╋";
+ char *c = "μ†ℱ ╋ℯ╳╋ 😇";
printf("%d %d %d %d %s\n", c[0]&0xff, c[1]&0xff, c[2]&0xff, c[3]&0xff, c);
- emscripten_run_script("cheez = _malloc(100);"
- "Module.writeStringToMemory(\"μ†ℱ ╋ℯ╳╋\", cheez);"
- "Module.print([Pointer_stringify(cheez), Module.getValue(cheez, 'i8')&0xff, Module.getValue(cheez+1, 'i8')&0xff, Module.getValue(cheez+2, 'i8')&0xff, Module.getValue(cheez+3, 'i8')&0xff, ]);");
+ emscripten_run_script(
+ "cheez = _malloc(100);"
+ "Module.writeStringToMemory(\"μ†ℱ ╋ℯ╳╋ 😇\", cheez);"
+ "Module.print([Pointer_stringify(cheez), Module.getValue(cheez, 'i8')&0xff, Module.getValue(cheez+1, 'i8')&0xff, Module.getValue(cheez+2, 'i8')&0xff, Module.getValue(cheez+3, 'i8')&0xff, ]);");
}
'''
- self.do_run(src, '206 188 226 128 μ†ℱ ╋ℯ╳╋\nμ†ℱ ╋ℯ╳╋,206,188,226,128\n');
+ self.do_run(src, '206 188 226 128 μ†ℱ ╋ℯ╳╋ 😇\nμ†ℱ ╋ℯ╳╋ 😇,206,188,226,128\n');
def test_direct_string_constant_usage(self):
if self.emcc_args is None: return self.skip('requires libcxx')
@@ -10141,4 +10142,3 @@ for compiler, quantum, embetter, typed_arrays in [
locals()[fullname] = make_run(fullname, fullname, compiler, embetter, quantum, typed_arrays)
del T # T is just a shape for the specific subclasses, we don't test it itself
-