From e90b624b158bef01ea56999613d59662684edd2c Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Sun, 18 Aug 2013 02:33:35 +0100 Subject: Fix 4 byte utf8 characters --- src/runtime.js | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/runtime.js b/src/runtime.js index e07d5054..959cdfd5 100644 --- a/src/runtime.js +++ b/src/runtime.js @@ -386,35 +386,51 @@ var Runtime = { // Returns a processor of UTF. // processCChar() receives characters from a C-like UTF representation and returns JS string fragments. + // See RFC3629 for details, the bytes are assumed to be valid UTF-8 // processJSString() receives a JS string and returns a C-like UTF representation in an array UTF8Processor: function() { var buffer = []; var needed = 0; this.processCChar = function (code) { - code = code & 0xff; - if (needed) { - buffer.push(code); - needed--; - } + code = code & 0xFF; + if (buffer.length == 0) { - if (code < 128) return String.fromCharCode(code); + if ((code & 0x80) == 0) { // 0xxxxxxx + return String.fromCharCode(code); + } buffer.push(code); - if (code > 191 && code < 224) { + if (((code & 0xE0) ^ 0xC0) == 0) { // 110xxxxx needed = 1; - } else { + } else if (((code & 0xF0) ^ 0xE0) == 0) { // 1110xxxx needed = 2; + } else { // 11110xxx + needed = 3; } return ''; } + + if (needed) { + buffer.push(code); + needed--; + } + if (needed > 0) return ''; var c1 = buffer[0]; var c2 = buffer[1]; var c3 = buffer[2]; + var c4 = buffer[3]; var ret; - if (c1 > 191 && c1 < 224) { - ret = String.fromCharCode(((c1 & 31) << 6) | (c2 & 63)); + if (buffer.length == 2) { + ret = String.fromCharCode(((c1 & 0x1F) << 6) | (c2 & 0x3F)); + } else if (buffer.length == 3) { + ret = String.fromCharCode(((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F)); } else { - ret = String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + var codePoint = ((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) | + ((c3 & 0x3F) << 6) | (c4 & 0x3F); + ret = String.fromCharCode( + Math.floor((codePoint - 0x10000) / 0x400) + 0xD800, + (codePoint - 0x10000) % 0x400 + 0xDC00); } buffer.length = 0; return ret; -- cgit v1.2.3-18-g5258 From 09c1168629557ab46818785004bcb95b696e254c Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Tue, 20 Aug 2013 18:00:53 +0100 Subject: Clearer (and faster?), as per jij's suggestion --- src/runtime.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/runtime.js b/src/runtime.js index 959cdfd5..868aacb6 100644 --- a/src/runtime.js +++ b/src/runtime.js @@ -395,15 +395,15 @@ var Runtime = { code = code & 0xFF; if (buffer.length == 0) { - if ((code & 0x80) == 0) { // 0xxxxxxx + if ((code & 0x80) == 0x00) { // 0xxxxxxx return String.fromCharCode(code); } buffer.push(code); - if (((code & 0xE0) ^ 0xC0) == 0) { // 110xxxxx + if ((code & 0xE0) == 0xC0) { // 110xxxxx needed = 1; - } else if (((code & 0xF0) ^ 0xE0) == 0) { // 1110xxxx + } else if ((code & 0xF0) == 0xE0) { // 1110xxxx needed = 2; - } else { // 11110xxx + } else { // 11110xxx needed = 3; } return ''; -- cgit v1.2.3-18-g5258 From 7bf034909f6d117320680200d14d0937cabdd787 Mon Sep 17 00:00:00 2001 From: Aidan Hobson Sayers Date: Tue, 20 Aug 2013 20:19:17 +0100 Subject: Only check needed > 0 if it was formerly > 0 --- src/runtime.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/runtime.js b/src/runtime.js index 868aacb6..33088ad9 100644 --- a/src/runtime.js +++ b/src/runtime.js @@ -412,9 +412,9 @@ var Runtime = { if (needed) { buffer.push(code); needed--; + if (needed > 0) return ''; } - if (needed > 0) return ''; var c1 = buffer[0]; var c2 = buffer[1]; var c3 = buffer[2]; -- cgit v1.2.3-18-g5258