29 files changed, 2748 insertions, 205 deletions
diff --git a/AUTHORS b/AUTHORS
index c157c468..ef611d55 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -50,3 +50,5 @@ a license to everyone to use it as detailed in LICENSE.)
 * Bruce Mitchener, Jr. <bruce.mitchener@gmail.com>
 * Michael Bishop <mbtyke@gmail.com>
 * Roger Braun <roger@rogerbraun.net>
+* Vladimir Vukicevic <vladimir@pobox.com> (copyright owned by Mozilla Foundation)
+
diff --git a/emcc b/emcc
index 64c65d47..03b18242 100755
--- a/emcc
+++ b/emcc
@@ -1,4 +1,5 @@
 #!/usr/bin/env python2
+# -*- Mode: python -*-
 
 '''
 emcc - compiler helper script
@@ -118,6 +119,28 @@ if len(sys.argv) == 1:
   print 'emcc: no input files'
   exit(1)
 
+# Expand response files (@file arguments) very early on, before any other argument handling.
+# Each pass splices one file's arguments into sys.argv and then rescans, so expanded text may name further @files.
+response_file = True
+while response_file:
+  response_file = None
+  for index in range(1, len(sys.argv)):
+    if sys.argv[index].startswith('@'): # startswith() tolerates an empty-string argument, unlike indexing [0]
+      # found one, loop again next time
+      response_file = sys.argv[index][1:]
+      print >>sys.stderr, 'emcc: using response file: %s' % response_file # report only once the name is known
+      if not os.path.exists(response_file):
+        print >>sys.stderr, 'emcc: error: Response file not found: %s' % response_file
+        exit(1)
+
+      response_fd = open(response_file, 'r')
+      try: extra_args = shlex.split(response_fd.read()) # shell-style splitting, so quoted arguments stay intact
+      finally: response_fd.close() # close the file even if reading or parsing raises
+
+      # slice in extra_args in place of the response file arg
+      sys.argv[index:index+1] = extra_args
+      break
+
 if sys.argv[1] == '--version':
   revision = '(unknown revision)'
   here = os.getcwd()
@@ -1054,29 +1077,32 @@ try:
     libcxxabi_symbols = filter(lambda symbol: symbol not in libc_symbols, libcxxabi_symbols)
     libcxxabi_symbols = set(libcxxabi_symbols)
 
-    force = False # If we have libcxx, we must force inclusion of libc, since libcxx uses new internally. Note: this is kind of hacky
-
+    # If we have libcxx, we must force inclusion of libc, since libcxx uses new internally. Note: this is kind of hacky
+    # Setting this in the environment will avoid checking dependencies and make building big projects a little faster
+    force = os.environ.get('EMCC_FORCE_STDLIBS')
+    has = need = None
     for name, create, fix, library_symbols in [('libcxx',    create_libcxx,    fix_libcxx,    libcxx_symbols),
                                                ('libcxxabi', create_libcxxabi, fix_libcxxabi, libcxxabi_symbols),
                                                ('libc',      create_libc,      fix_libc,      libc_symbols)]:
-      need = set()
-      has = set()
-      for temp_file in temp_files:
-        symbols = shared.Building.llvm_nm(temp_file)
-        for library_symbol in library_symbols:
-          if library_symbol in symbols.undefs:
-            need.add(library_symbol)
-          if library_symbol in symbols.defs:
-            has.add(library_symbol)
-      for haz in has: # remove symbols that are supplied by another of the inputs
-        if haz in need:
-          need.remove(haz)
-      if DEBUG: print >> sys.stderr, 'emcc: considering including %s: we need %s and have %s' % (name, str(need), str(has))
+      if not force:
+        need = set()
+        has = set()
+        for temp_file in temp_files:
+          symbols = shared.Building.llvm_nm(temp_file)
+          for library_symbol in library_symbols:
+            if library_symbol in symbols.undefs:
+              need.add(library_symbol)
+            if library_symbol in symbols.defs:
+              has.add(library_symbol)
+        for haz in has: # remove symbols that are supplied by another of the inputs
+          if haz in need:
+            need.remove(haz)
+        if DEBUG: print >> sys.stderr, 'emcc: considering including %s: we need %s and have %s' % (name, str(need), str(has))
       if force or len(need) > 0:
         # We need to build and link the library in
         if DEBUG: print >> sys.stderr, 'emcc: including %s' % name
         libfile = shared.Cache.get(name, create)
-        if len(has) > 0:
+        if has and len(has) > 0:
           # remove the symbols we do not need
           fixed = in_temp(uniquename(libfile)) + '.bc'
           shutil.copyfile(libfile, fixed)
@@ -1086,7 +1112,7 @@ try:
           libfile = fixed
         extra_files_to_link.append(libfile)
         force = True
-        if fix:
+        if fix and need:
           fix(need)
 
   # First, combine the bitcode files if there are several. We must also link if we have a singleton .a
diff --git a/emscripten.py b/emscripten.py
index af762a21..dc5d5f5b 100755
--- a/emscripten.py
+++ b/emscripten.py
@@ -43,7 +43,7 @@ def scan(ll, settings):
   if len(blockaddrs) > 0:
     settings['NECESSARY_BLOCKADDRS'] = blockaddrs
 
-NUM_CHUNKS_PER_CORE = 5
+NUM_CHUNKS_PER_CORE = 1.25
 MIN_CHUNK_SIZE = 1024*1024
 MAX_CHUNK_SIZE = float(os.environ.get('EMSCRIPT_MAX_CHUNK_SIZE') or 'inf') # configuring this is just for debugging purposes
 
@@ -131,7 +131,7 @@ def emscript(infile, settings, outfile, libraries=[]):
   settings_file = temp_files.get('.txt').name
   def save_settings():
     global settings_text
-    settings_text = json.dumps(settings)
+    settings_text = json.dumps(settings, sort_keys=True)
     s = open(settings_file, 'w')
     s.write(settings_text)
     s.close()
@@ -160,12 +160,12 @@ def emscript(infile, settings, outfile, libraries=[]):
 
   # Phase 2 - func
 
-  cores = multiprocessing.cpu_count()
+  cores = int(os.environ.get('EMCC_CORES') or multiprocessing.cpu_count())
   assert cores >= 1
   if cores > 1:
-    intended_num_chunks = cores * NUM_CHUNKS_PER_CORE
+    intended_num_chunks = int(round(cores * NUM_CHUNKS_PER_CORE))
     chunk_size = max(MIN_CHUNK_SIZE, total_ll_size / intended_num_chunks)
-    chunk_size += 3*len(meta) # keep ratio of lots of function code to meta (expensive to process, and done in each parallel task)
+    chunk_size += 3*len(meta) + len(forwarded_data)/3 # keep ratio of lots of function code to meta (expensive to process, and done in each parallel task) and forwarded data (less expensive but potentially significant)
     chunk_size = min(MAX_CHUNK_SIZE, chunk_size)
   else:
     chunk_size = MAX_CHUNK_SIZE # if 1 core, just use the max chunk size
@@ -317,15 +317,12 @@ def emscript(infile, settings, outfile, libraries=[]):
       params = ','.join(['p%d' % p for p in range(len(sig)-1)])
       coercions = ';'.join(['p%d = %sp%d%s' % (p, '+' if sig[p+1] != 'i' else '', p, '' if sig[p+1] != 'i' else '|0') for p in range(len(sig)-1)]) + ';'
       ret = '' if sig[0] == 'v' else ('return %s0' % ('+' if sig[0] != 'i' else ''))
-      return ('function %s(%s) { %s abort(%d); %s };' % (bad, params, coercions, i, ret), raw.replace('[0,', '[' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0]', ',' + bad + ']').replace(',0]', ',' + bad + ']'))
+      return ('function %s(%s) { %s abort(%d); %s };' % (bad, params, coercions, i, ret), raw.replace('[0,', '[' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0]', ',' + bad + ']').replace(',0]', ',' + bad + ']').replace(',0\n', ',' + bad + '\n'))
     infos = [make_table(sig, raw) for sig, raw in last_forwarded_json['Functions']['tables'].iteritems()]
     function_tables_defs = '\n'.join([info[0] for info in infos] + [info[1] for info in infos])
 
     asm_setup = ''
-    maths = ['Math.' + func for func in ['floor', 'abs', 'sqrt', 'pow', 'cos', 'sin', 'tan', 'acos', 'asin', 'atan', 'atan2', 'exp', 'log', 'ceil']]
-    if settings['USE_MATH_IMUL']:
-      maths += ['Math.imul']
-      asm_setup += 'if (!Math.imul) Math.imul = function(x, y) { return (x*y)|0 }; // # not a real polyfill since semantics not identical, but close and fairly fast\n'
+    maths = ['Math.' + func for func in ['floor', 'abs', 'sqrt', 'pow', 'cos', 'sin', 'tan', 'acos', 'asin', 'atan', 'atan2', 'exp', 'log', 'ceil', 'imul']]
     fundamentals = ['Math', 'Int8Array', 'Int16Array', 'Int32Array', 'Uint8Array', 'Uint16Array', 'Uint32Array', 'Float32Array', 'Float64Array']
     math_envs = ['Runtime.bitshift64', 'Math.min'] # TODO: move min to maths
     asm_setup += '\n'.join(['var %s = %s;' % (f.replace('.', '_'), f) for f in math_envs])
diff --git a/src/analyzer.js b/src/analyzer.js
index adc615fb..c930231f 100644
--- a/src/analyzer.js
+++ b/src/analyzer.js
@@ -684,9 +684,9 @@ function analyzer(data, sidePass) {
                       params: [(signed && j + whole > sourceElements.length) ? signedKeepAlive : null],
                       type: 'i32',
                     };
-                    if (j == 0 && isUnsignedOp(value.op) && sourceBits < 32) {
+                    if (j == 0 && sourceBits < 32) {
                       // zext sign correction
-                      result.ident = makeSignOp(result.ident, 'i' + sourceBits, 'un', 1, 1);
+                      result.ident = makeSignOp(result.ident, 'i' + sourceBits, isUnsignedOp(value.op) ? 'un' : 're', 1, 1);
                     }
                     if (fraction != 0) {
                       var other = {
diff --git a/src/jsifier.js b/src/jsifier.js
index 71975b9d..ff43c8c6 100644
--- a/src/jsifier.js
+++ b/src/jsifier.js
@@ -505,9 +505,9 @@ function JSify(data, functionsOnly, givenFunctions) {
         item.JS = '';
       } else if (LibraryManager.library.hasOwnProperty(shortident)) {
         item.JS = addFromLibrary(shortident);
-      } else {
+      } else if (!LibraryManager.library.hasOwnProperty(shortident + '__inline')) {
         item.JS = 'var ' + item.ident + '; // stub for ' + item.ident;
-        if (WARN_ON_UNDEFINED_SYMBOLS) {
+        if (WARN_ON_UNDEFINED_SYMBOLS || ASM_JS) { // always warn on undefs in asm, since it breaks validation
           warn('Unresolved symbol: ' + item.ident);
         }
       }
@@ -1401,6 +1401,8 @@ function JSify(data, functionsOnly, givenFunctions) {
       if (ASM_JS) {
         assert(returnType.search(/\("'\[,/) == -1); // XXX need isFunctionType(type, out)
         callIdent = '(' + callIdent + ')&{{{ FTM_' + sig + ' }}}'; // the function table mask is set in emscripten.py
+      } else if (SAFE_DYNCALLS) {
+        callIdent = '(tempInt=' + callIdent + ',tempInt < 0 || tempInt >= FUNCTION_TABLE.length-1 ? abort("dyncall error") : tempInt)';
       }
       callIdent = Functions.getTable(sig) + '[' + callIdent + ']';
     }
@@ -1511,7 +1513,7 @@ function JSify(data, functionsOnly, givenFunctions) {
         print('// ASM_LIBRARY FUNCTIONS');
         function fix(f) { // fix indenting to not confuse js optimizer
           f = f.substr(f.indexOf('f')); // remove initial spaces before 'function'
-          f = f.substr(0, f.lastIndexOf('\n')+1); // remove spaces and last }
+          f = f.substr(0, f.lastIndexOf('\n')+1); // remove spaces and last }  XXX assumes function has multiple lines
           return f + '}'; // add unindented } to match function
         }
         print(asmLibraryFunctions.map(fix).join('\n'));
@@ -1567,9 +1569,11 @@ function JSify(data, functionsOnly, givenFunctions) {
     var shellParts = read(shellFile).split('{{BODY}}');
     print(shellParts[1]);
     // Print out some useful metadata (for additional optimizations later, like the eliminator)
-    print('// EMSCRIPTEN_GENERATED_FUNCTIONS: ' + JSON.stringify(keys(Functions.implementedFunctions).filter(function(func) {
-      return IGNORED_FUNCTIONS.indexOf(func.ident) < 0;
-    })) + '\n');
+    if (EMIT_GENERATED_FUNCTIONS) {
+      print('// EMSCRIPTEN_GENERATED_FUNCTIONS: ' + JSON.stringify(keys(Functions.implementedFunctions).filter(function(func) {
+        return IGNORED_FUNCTIONS.indexOf(func.ident) < 0;
+      })) + '\n');
+    }
 
     PassManager.serialize();
 
diff --git a/src/library.js b/src/library.js
index d8f98d73..d0f73fdd 100644
--- a/src/library.js
+++ b/src/library.js
@@ -4525,11 +4525,16 @@ LibraryManager.library = {
     return 0;
   },
 
+  memcmp__asm: 'true',
+  memcmp__sig: 'iiii',
   memcmp: function(p1, p2, num) {
-    for (var i = 0; i < num; i++) {
-      var v1 = {{{ makeGetValue('p1', 'i', 'i8', 0, 1) }}};
-      var v2 = {{{ makeGetValue('p2', 'i', 'i8', 0, 1) }}};
-      if (v1 != v2) return v1 > v2 ? 1 : -1;
+    p1 = p1|0; p2 = p2|0; num = num|0; // coerce all three parameters to int
+    var i = 0, v1 = 0, v2 = 0; // the only declarations: asm.js allows 'var' solely at the top of a function
+    while ((i|0) < (num|0)) {
+      v1 = {{{ makeGetValueAsm('p1', 'i', 'i8', true) }}}; // plain assignment (no 'var'); byte is read as unsigned
+      v2 = {{{ makeGetValueAsm('p2', 'i', 'i8', true) }}};
+      if ((v1|0) != (v2|0)) return ((v1|0) > (v2|0) ? 1 : -1)|0; // first differing byte decides the result
+      i = (i+1)|0;
     }
     return 0;
   },
diff --git a/src/library_browser.js b/src/library_browser.js
index e9396d69..5b19a360 100644
--- a/src/library_browser.js
+++ b/src/library_browser.js
@@ -204,7 +204,12 @@ mergeInto(LibraryManager.library, {
       var ctx;
       try {
         if (useWebGL) {
-          ctx = canvas.getContext('experimental-webgl', { alpha: false });
+          ctx = canvas.getContext('experimental-webgl', {
+            alpha: false,
+#if GL_TESTING
+            preserveDrawingBuffer: true
+#endif
+          });
         } else {
           ctx = canvas.getContext('2d');
         }
diff --git a/src/library_gl.js b/src/library_gl.js
index 8fbe48ac..a20eccf6 100644
--- a/src/library_gl.js
+++ b/src/library_gl.js
@@ -996,6 +996,7 @@ var LibraryGL = {
     // VAO support
     vaos: [],
     currentVao: null,
+    enabledVertexAttribArrays: {}, // helps with vao cleanups
 
     init: function() {
       GLEmulation.fogColor = new Float32Array(4);
@@ -1410,12 +1411,14 @@ var LibraryGL = {
       var glEnableVertexAttribArray = _glEnableVertexAttribArray;
       _glEnableVertexAttribArray = function(index) {
         glEnableVertexAttribArray(index);
+        GLEmulation.enabledVertexAttribArrays[index] = 1;
         if (GLEmulation.currentVao) GLEmulation.currentVao.enabledVertexAttribArrays[index] = 1;
       };
 
       var glDisableVertexAttribArray = _glDisableVertexAttribArray;
       _glDisableVertexAttribArray = function(index) {
         glDisableVertexAttribArray(index);
+        delete GLEmulation.enabledVertexAttribArrays[index];
         if (GLEmulation.currentVao) delete GLEmulation.currentVao.enabledVertexAttribArrays[index];
       };
 
@@ -1489,6 +1492,9 @@ var LibraryGL = {
         case 'glIsFramebuffer': ret = {{{ Functions.getIndex('_glIsFramebuffer', true) }}}; break;
         case 'glCheckFramebufferStatus': ret = {{{ Functions.getIndex('_glCheckFramebufferStatus', true) }}}; break;
         case 'glRenderbufferStorage': ret = {{{ Functions.getIndex('_glRenderbufferStorage', true) }}}; break;
+        case 'glGenVertexArrays': ret = {{{ Functions.getIndex('_glGenVertexArrays', true) }}}; break;
+        case 'glDeleteVertexArrays': ret = {{{ Functions.getIndex('_glDeleteVertexArrays', true) }}}; break;
+        case 'glBindVertexArray': ret = {{{ Functions.getIndex('_glBindVertexArray', true) }}}; break;
       }
       if (!ret) Module.printErr('WARNING: getProcAddress failed for ' + name);
       return ret;
@@ -1912,27 +1918,35 @@ var LibraryGL = {
           }
 
           // If the array buffer is unchanged and the renderer as well, then we can avoid all the work here
-          // XXX We use some heuristics here, and this may not work in all cases. Try disabling this if you
-          // have odd glitches (by setting canSkip always to 0, or even cleaning up the renderer right
-          // after rendering)
+          // XXX We use some heuristics here, and this may not work in all cases. Try disabling GL_UNSAFE_OPTS if you
+          // have odd glitches
+#if GL_UNSAFE_OPTS
           var lastRenderer = GL.immediate.lastRenderer;
           var canSkip = this == lastRenderer &&
                         arrayBuffer == GL.immediate.lastArrayBuffer &&
                         (GL.currProgram || this.program) == GL.immediate.lastProgram &&
                         !GL.immediate.matricesModified;
           if (!canSkip && lastRenderer) lastRenderer.cleanup();
+#endif
           if (!GL.currArrayBuffer) {
             // Bind the array buffer and upload data after cleaning up the previous renderer
+#if GL_UNSAFE_OPTS
+            // Potentially unsafe, since lastArrayBuffer might not reflect the true array buffer in code that mixes immediate/non-immediate
             if (arrayBuffer != GL.immediate.lastArrayBuffer) {
+#endif
               Module.ctx.bindBuffer(Module.ctx.ARRAY_BUFFER, arrayBuffer);
+#if GL_UNSAFE_OPTS
             }
+#endif
             Module.ctx.bufferSubData(Module.ctx.ARRAY_BUFFER, start, GL.immediate.vertexData.subarray(start >> 2, end >> 2));
           }
+#if GL_UNSAFE_OPTS
           if (canSkip) return;
           GL.immediate.lastRenderer = this;
           GL.immediate.lastArrayBuffer = arrayBuffer;
           GL.immediate.lastProgram = GL.currProgram || this.program;
           GL.immediate.matricesModified = false;
+#endif
 
           if (!GL.currProgram) {
             Module.ctx.useProgram(this.program);
@@ -2008,9 +2022,11 @@ var LibraryGL = {
             Module.ctx.bindBuffer(Module.ctx.ARRAY_BUFFER, null);
           }
 
+#if GL_UNSAFE_OPTS
           GL.immediate.lastRenderer = null;
           GL.immediate.lastArrayBuffer = null;
           GL.immediate.lastProgram = null;
+#endif
           GL.immediate.matricesModified = true;
         }
       };
@@ -2255,6 +2271,10 @@ var LibraryGL = {
       if (emulatedElementArrayBuffer) {
         Module.ctx.bindBuffer(Module.ctx.ELEMENT_ARRAY_BUFFER, GL.buffers[GL.currElementArrayBuffer] || null);
       }
+
+#if GL_UNSAFE_OPTS == 0
+      renderer.cleanup();
+#endif
     }
   },
 
@@ -2489,9 +2509,11 @@ var LibraryGL = {
     if (disable && GL.immediate.enabledClientAttributes[attrib]) {
       GL.immediate.enabledClientAttributes[attrib] = false;
       GL.immediate.totalEnabledClientAttributes--;
+      if (GLEmulation.currentVao) delete GLEmulation.currentVao.enabledClientStates[cap];
     } else if (!disable && !GL.immediate.enabledClientAttributes[attrib]) {
       GL.immediate.enabledClientAttributes[attrib] = true;
       GL.immediate.totalEnabledClientAttributes++;
+      if (GLEmulation.currentVao) GLEmulation.currentVao.enabledClientStates[cap] = 1;
     }
     GL.immediate.modifiedClientAttributes = true;
   },
@@ -2520,6 +2542,7 @@ var LibraryGL = {
 
   // Vertex array object (VAO) support. TODO: when the WebGL extension is popular, use that and remove this code and GL.vaos
   glGenVertexArrays__deps: ['$GLEMulation'],
+  glGenVertexArrays__sig: 'vii', // signature is a plain string: no return value, two int parameters
   glGenVertexArrays: function(n, vaos) {
     for (var i = 0; i < n; i++) {
       var id = GL.getNewId(GLEmulation.vaos); 
@@ -2529,10 +2552,12 @@ var LibraryGL = {
         elementArrayBuffer: 0,
         enabledVertexAttribArrays: {},
         vertexAttribPointers: {},
+        enabledClientStates: {},
       };
       {{{ makeSetValue('vaos', 'i*4', 'id', 'i32') }}};
     }
   },
+  glDeleteVertexArrays__sig: 'vii', // signature is a plain string: no return value, two int parameters
   glDeleteVertexArrays: function(n, vaos) {
     for (var i = 0; i < n; i++) {
       var id = {{{ makeGetValue('vaos', 'i*4', 'i32') }}};
@@ -2540,10 +2565,22 @@ var LibraryGL = {
       if (GLEmulation.currentVao && GLEmulation.currentVao.id == id) GLEmulation.currentVao = null;
     }
   },
+  glBindVertexArray__sig: 'vi', // signature is a plain string: no return value, one int parameter
   glBindVertexArray: function(vao) {
+    // undo vao-related things, wipe the slate clean, both for vao of 0 or an actual vao
+    GLEmulation.currentVao = null; // make sure the commands we run here are not recorded
+    if (GL.immediate.lastRenderer) GL.immediate.lastRenderer.cleanup();
+    _glBindBuffer(Module.ctx.ARRAY_BUFFER, 0); // XXX if one was there before we were bound?
+    _glBindBuffer(Module.ctx.ELEMENT_ARRAY_BUFFER, 0);
+    for (var vaa in GLEmulation.enabledVertexAttribArrays) {
+      Module.ctx.disableVertexAttribArray(vaa);
+    }
+    GLEmulation.enabledVertexAttribArrays = {};
+    GL.immediate.enabledClientAttributes = [0, 0];
+    GL.immediate.totalEnabledClientAttributes = 0;
+    GL.immediate.modifiedClientAttributes = true;
     if (vao) {
       // replay vao
-      if (GLEmulation.currentVao) _glBindVertexArray(0); // flush the old one out
       var info = GLEmulation.vaos[vao];
       _glBindBuffer(Module.ctx.ARRAY_BUFFER, info.arrayBuffer); // XXX overwrite current binding?
       _glBindBuffer(Module.ctx.ELEMENT_ARRAY_BUFFER, info.elementArrayBuffer);
@@ -2553,16 +2590,10 @@ var LibraryGL = {
       for (var vaa in info.vertexAttribPointers) {
         _glVertexAttribPointer.apply(null, info.vertexAttribPointers[vaa]);
       }
-      GLEmulation.currentVao = info; // set currentVao last, so the commands we ran here were not recorded
-    } else if (GLEmulation.currentVao) {
-      // undo vao
-      var info = GLEmulation.currentVao;
-      GLEmulation.currentVao = null; // set currentVao first, so the commands we run here are not recorded
-      _glBindBuffer(Module.ctx.ARRAY_BUFFER, 0); // XXX if one was there before we were bound?
-      _glBindBuffer(Module.ctx.ELEMENT_ARRAY_BUFFER, 0);
-      for (var vaa in info.enabledVertexAttribArrays) {
-        _glDisableVertexAttribArray(vaa);
+      for (var attrib in info.enabledClientStates) {
+        _glEnableClientState(attrib|0);
       }
+      GLEmulation.currentVao = info; // set currentVao last, so the commands we ran here were not recorded
     }
   },
 
@@ -2787,6 +2818,7 @@ var LibraryGL = {
   glGenVertexArraysOES: 'glGenVertexArrays',
   glDeleteVertexArraysOES: 'glDeleteVertexArrays',
   glBindVertexArrayOES: 'glBindVertexArray',
+  glFramebufferTexture2DOES: 'glFramebufferTexture2D'
 };
 
 // Simple pass-through functions. Starred ones have return values. [X] ones have X in the C name but not in the JS name
diff --git a/src/modules.js b/src/modules.js
index 695abbe7..7f8a959b 100644
--- a/src/modules.js
+++ b/src/modules.js
@@ -330,6 +330,22 @@ var Functions = {
           }
         }
       }
+      if (SAFE_DYNCALLS) {
+        assert(!ASM_JS, 'cannot emit safe dyncalls in asm');
+        for (var j = 0; j < table.length; j++) {
+          if (table[j] == 0) {
+            table[j] = "function() { abort('dyncall error') }";
+          }
+        }
+      }
+      if (table.length > 20) {
+        // add some newlines in the table, for readability
+        var j = 10;
+        while (j+10 < table.length) {
+          table[j] += '\n';
+          j += 10;
+        }
+      }
       var indices = table.toString().replace('"', '');
       if (BUILD_AS_SHARED_LIB) {
         // Shared libraries reuse the parent's function table.
diff --git a/src/parseTools.js b/src/parseTools.js
index ca9ad40a..6e0d6e32 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -103,6 +103,11 @@ function isNiceIdent(ident, loose) {
   }
 }
 
+function isJSVar(ident) { // tests whether ident is only identifier characters and spaces, optionally opened by '(' and closed by one or more ')'
+  return /^\(?[$_]?[\w$_\d ]*\)+$/.test(ident); // NOTE(review): '\)+' requires at least one trailing ')', so a bare name such as 'x' does not match - confirm this is intended
+
+}
+
 function isStructPointerType(type) {
   // This test is necessary for clang - in llvm-gcc, we
   // could check for %struct. The downside is that %1 can
@@ -988,7 +993,8 @@ function getHeapOffset(offset, type, forceAsm) {
     if (shifts != 0) {
       return '(' + offset + '>>' + shifts + ')';
     } else {
-      return offset;
+      // we need to guard against overflows here, HEAP[U]8 expects a guaranteed int
+      return isJSVar(offset) ? offset : '(' + offset + '|0)';
     }
   }
 }
@@ -1038,20 +1044,6 @@ function asmCoercion(value, type, signedness) {
   }
 }
 
-var TWO_TWENTY = Math.pow(2, 20);
-
-function asmMultiplyI32(a, b) {
-  // special-case: there is no integer multiply in asm, because there is no true integer
-  // multiply in JS. While we wait for Math.imul, do double multiply
-  if ((isNumber(a) && Math.abs(a) < TWO_TWENTY) || (isNumber(b) && Math.abs(b) < TWO_TWENTY)) {
-    return '(((' + a + ')*(' + b + '))&-1)'; // small enough to emit directly as a multiply
-  }
-  if (USE_MATH_IMUL) {
-    return 'Math.imul(' + a + ',' + b + ')';
-  }
-  return '(~~(+((' + a + ')|0) * +((' + b + ')|0)))';
-}
-
 function asmFloatToInt(x) {
   return '(~~(' + x + '))';
 }
@@ -1145,8 +1137,8 @@ function makeGetValue(ptr, pos, type, noNeedFirst, unsigned, ignore, align, noSa
   }
 }
 
-function makeGetValueAsm(ptr, pos, type) {
-  return makeGetValue(ptr, pos, type, null, null, null, null, null, true);
+function makeGetValueAsm(ptr, pos, type, unsigned) {
+  return makeGetValue(ptr, pos, type, null, unsigned, null, null, null, true);
 }
 
 function indexizeFunctions(value, type) {
@@ -1364,9 +1356,11 @@ function makeHEAPView(which, start, end) {
 var PLUS_MUL = set('+', '*');
 var MUL_DIV = set('*', '/');
 var PLUS_MINUS = set('+', '-');
+var TWO_TWENTY = Math.pow(2, 20);
 
 // Given two values and an operation, returns the result of that operation.
 // Tries to do as much as possible at compile time.
+// Leaves overflows etc. unhandled, *except* for integer multiply, in order to be efficient with Math.imul
 function getFastValue(a, op, b, type) {
   a = a.toString();
   b = b.toString();
@@ -1402,8 +1396,14 @@ function getFastValue(a, op, b, type) {
           return '(' + a + '<<' + shifts + ')';
         }
       }
-      if (ASM_JS && !(type in Runtime.FLOAT_TYPES)) {
-        return asmMultiplyI32(a, b); // unoptimized multiply, do it using asm.js's special multiply operation
+      if (!(type in Runtime.FLOAT_TYPES)) {
+        // if guaranteed small enough to not overflow into a double, do a normal multiply
+        var bits = getBits(type) || 32; // default is 32-bit multiply for things like getelementptr indexes
+        // Note that we can emit a simple multiply in non-asm.js mode, but asm.js will not parse a "16-bit" multiply, so we must use imul there
+        if ((isNumber(a) && Math.abs(a) < TWO_TWENTY) || (isNumber(b) && Math.abs(b) < TWO_TWENTY) || (bits < 32 && !ASM_JS)) {
+          return '(((' + a + ')*(' + b + '))&' + ((Math.pow(2, bits)-1)|0) + ')'; // keep a non-eliminatable coercion directly on this
+        }
+        return 'Math.imul(' + a + ',' + b + ')';
       }
     } else {
       if (a == '0') {
@@ -1548,7 +1548,7 @@ function makePointer(slab, pos, allocator, type, ptr) {
     var ret = '';
     var index = 0;
     while (index < array.length) {
-      ret = (ret ? ret + '.concat(' : '') + '[' + array.slice(index, index + chunkSize).map(JSON.stringify) + ']' + (ret ? ')' : '');
+      ret = (ret ? ret + '.concat(' : '') + '[' + array.slice(index, index + chunkSize).map(JSON.stringify) + ']' + (ret ? ')\n' : '');
       index += chunkSize;
     }
     return ret;
@@ -2130,14 +2130,7 @@ function processMathop(item) {
     case 'add': return handleOverflow(getFastValue(idents[0], '+', idents[1], item.type), bits);
     case 'sub': return handleOverflow(getFastValue(idents[0], '-', idents[1], item.type), bits);
     case 'sdiv': case 'udiv': return makeRounding(getFastValue(idents[0], '/', idents[1], item.type), bits, op[0] === 's');
-    case 'mul': {
-      if (bits == 32 && PRECISE_I32_MUL) {
-        Types.preciseI64MathUsed = true;
-        return '(i64Math' + (ASM_JS ? '_' : '.') + 'multiply(' + asmCoercion(idents[0], 'i32') + ',0,' + asmCoercion(idents[1], 'i32') + ',0),' + makeGetValue('tempDoublePtr', 0, 'i32') + ')';
-      } else {
-        return '((' +getFastValue(idents[0], '*', idents[1], item.type) + ')&-1)'; // force a non-eliminatable coercion here, to prevent a double result from leaking
-      }
-    }
+    case 'mul': return getFastValue(idents[0], '*', idents[1], item.type); // overflow handling is already done in getFastValue for '*'
     case 'urem': case 'srem': return getFastValue(idents[0], '%', idents[1], item.type);
     case 'or': {
       if (bits > 32) {
diff --git a/src/preamble.js b/src/preamble.js
index 503b09f1..a7731e7f 100644
--- a/src/preamble.js
+++ b/src/preamble.js
@@ -742,6 +742,20 @@ Module['writeArrayToMemory'] = writeArrayToMemory;
 {{{ unSign }}}
 {{{ reSign }}}
 
+#if PRECISE_I32_MUL
+if (!Math.imul) Math.imul = function(a, b) { // fallback installed only when the engine provides no Math.imul
+  var ah  = a >>> 16; // upper 16 bits of a
+  var al = a & 0xffff; // lower 16 bits of a
+  var bh  = b >>> 16; // upper 16 bits of b
+  var bl = b & 0xffff; // lower 16 bits of b
+  return (al*bl + ((ah*bl + al*bh) << 16))|0; // the ah*bh term is left out: it is a multiple of 2^32 and cannot affect the 32-bit result
+};
+#else
+Math.imul = function(a, b) { // NOTE(review): assigned unconditionally, replacing any native Math.imul - confirm this is intended
+  return (a*b)|0; // fast but imprecise
+};
+#endif
+
 // A counter of dependencies for calling run(). If we need to
 // do asynchronous work before running, increment this and
 // decrement it. Incrementing must happen in a place like
diff --git a/src/settings.js b/src/settings.js
index d036822f..308afddc 100644
--- a/src/settings.js
+++ b/src/settings.js
@@ -92,13 +92,9 @@ var PRECISE_I64_MATH = 1; // If enabled, i64 addition etc. is emulated - which i
                           // that we can't know at compile time that 64-bit math is needed. For example, if you
                           // print 64-bit values with printf, but never add them, we can't know at compile time
                           // and you need to set this to 2.
-var PRECISE_I32_MUL = 0; // If enabled, i64 math is done in i32 multiplication. This is necessary if the values
-                         // exceed the JS double-integer limit of ~52 bits. This option can normally be disabled
-                         // because generally i32 multiplication works ok without it, and enabling it has a big
-                         // impact on performance.
-                         // Note that you can hand-optimize your code to avoid the need for this: If you do
-                         // multiplications that actually need 64-bit precision inside 64-bit values, things
-                         // will work properly. (Unless the LLVM optimizer turns them into 32-bit values?)
+var PRECISE_I32_MUL = 1; // If enabled, i32 multiplication is done with full precision, which means it is
+                         // correct even if the value exceeds the JS double-integer limit of ~52 bits (otherwise,
+                         // rounding will occur above that range).
 
 var CLOSURE_ANNOTATIONS = 0; // If set, the generated code will be annotated for the closure
                              // compiler. This potentially lets closure optimize the code better.
@@ -132,6 +128,8 @@ var SAFE_HEAP = 0; // Check each write to the heap, for example, this will give
                    // that 3 is the option you usually want here.
 var SAFE_HEAP_LOG = 0; // Log out all SAFE_HEAP operations
 
+var SAFE_DYNCALLS = 0; // Show stack traces on missing function pointer/virtual method calls
+
 var ASM_HEAP_LOG = 0; // Simple heap logging, like SAFE_HEAP_LOG but cheaper, and in asm.js
 
 var CORRUPTION_CHECK = 0; // When enabled, will emit a buffer area at the beginning and
@@