aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlon Zakai <alonzakai@gmail.com>2013-08-20 13:56:22 -0700
committerAlon Zakai <alonzakai@gmail.com>2013-08-20 13:56:22 -0700
commit6f2c31d4c86d168967e019bc6e731f6c454de91c (patch)
tree73f194d034c88cadf2bce70d53062dc68425e9b4 /src
parenta0c1f8c193e4d5c318f04b886c52bc3e8ab06980 (diff)
parent7bf034909f6d117320680200d14d0937cabdd787 (diff)
Merge pull request #1537 from aidanhs/4-byte-utf8-chars
Fix printing of 4 byte UTF-8 characters
Diffstat (limited to 'src')
-rw-r--r--src/runtime.js40
1 files changed, 28 insertions, 12 deletions
diff --git a/src/runtime.js b/src/runtime.js
index e07d5054..33088ad9 100644
--- a/src/runtime.js
+++ b/src/runtime.js
@@ -386,35 +386,51 @@ var Runtime = {
// Returns a processor of UTF.
// processCChar() receives characters from a C-like UTF representation and returns JS string fragments.
+ // See RFC 3629 for details; the input bytes are assumed to be valid UTF-8.
// processJSString() receives a JS string and returns a C-like UTF representation in an array
UTF8Processor: function() {
var buffer = [];
var needed = 0;
this.processCChar = function (code) {
- code = code & 0xff;
- if (needed) {
- buffer.push(code);
- needed--;
- }
+ code = code & 0xFF;
+
if (buffer.length == 0) {
- if (code < 128) return String.fromCharCode(code);
+ if ((code & 0x80) == 0x00) { // 0xxxxxxx
+ return String.fromCharCode(code);
+ }
buffer.push(code);
- if (code > 191 && code < 224) {
+ if ((code & 0xE0) == 0xC0) { // 110xxxxx
needed = 1;
- } else {
+ } else if ((code & 0xF0) == 0xE0) { // 1110xxxx
needed = 2;
+ } else { // 11110xxx
+ needed = 3;
}
return '';
}
- if (needed > 0) return '';
+
+ if (needed) {
+ buffer.push(code);
+ needed--;
+ if (needed > 0) return '';
+ }
+
var c1 = buffer[0];
var c2 = buffer[1];
var c3 = buffer[2];
+ var c4 = buffer[3];
var ret;
- if (c1 > 191 && c1 < 224) {
- ret = String.fromCharCode(((c1 & 31) << 6) | (c2 & 63));
+ if (buffer.length == 2) {
+ ret = String.fromCharCode(((c1 & 0x1F) << 6) | (c2 & 0x3F));
+ } else if (buffer.length == 3) {
+ ret = String.fromCharCode(((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
} else {
- ret = String.fromCharCode(((c1 & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+ var codePoint = ((c1 & 0x07) << 18) | ((c2 & 0x3F) << 12) |
+ ((c3 & 0x3F) << 6) | (c4 & 0x3F);
+ ret = String.fromCharCode(
+ Math.floor((codePoint - 0x10000) / 0x400) + 0xD800,
+ (codePoint - 0x10000) % 0x400 + 0xDC00);
}
buffer.length = 0;
return ret;