13 files changed, 1860 insertions, 594 deletions
diff --git a/AUTHORS b/AUTHORS
index 2b2b21ed..b03bfe7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -110,3 +110,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * John Vilk <jvilk@cs.umass.edu>
 * Daniel Baulig <dbaulig@fb.com> (copyright owned by Facebook, Inc.)
 * Lu Wang <coolwanglu@gmail.com>
+* Heidi Pan <heidi.pan@intel.com> (copyright owned by Intel)
diff --git a/emmake b/emmake
index 18e6afa9..e8f34f41 100755
--- a/emmake
+++ b/emmake
@@ -6,7 +6,7 @@ the environment variables to use emcc and so forth. Usage:
 
   emmake make [FLAGS]
 
-Not that if you ran configure with emconfigure, then
+Note that if you ran configure with emconfigure, then
 the environment variables have already been detected
 and set. This script is useful if you have no configure
 step, and your Makefile uses the environment vars
diff --git a/src/intertyper.js b/src/intertyper.js
index 940c677f..15f619a1 100644
--- a/src/intertyper.js
+++ b/src/intertyper.js
@@ -524,6 +524,27 @@ function intertyper(lines, sidePass, baseLineNums) {
             }
           });
         }
+      } else if (ident == '_llvm_used') {
+        var chunk = item.tokens[1].tokens;
+        var funcs = [];
+        var part = [];
+
+        for (var i = 0; i < chunk.length; i++) {
+          if (chunk[i].text == ',') {
+            var call = parseLLVMFunctionCall(part);
+            EXPORTED_FUNCTIONS[call.ident] = 0;
+            part = [];
+          } else {
+            part.push(chunk[i]);
+          }
+        }
+        if (part.length > 0) {
+          var call = parseLLVMFunctionCall(part);
+          EXPORTED_FUNCTIONS[call.ident] = 0;
+        }
+
+        ret.type = 'i32';
+        ret.value = { intertype: 'value', ident: '0', value: '0', type: ret.type };
       } else if (!external) {
         if (item.tokens[1] && item.tokens[1].text != ';') {
           if (item.tokens[1].text == 'c') {
@@ -538,6 +559,7 @@ function intertyper(lines, sidePass, baseLineNums) {
           ret.value = { intertype: 'value', ident: '0', value: '0', type: ret.type };
         }
       }
+
       return ret;
     }
   }
diff --git a/src/jsifier.js b/src/jsifier.js
index cb753e57..fb6c5ba8 100644
--- a/src/jsifier.js
+++ b/src/jsifier.js
@@ -1373,8 +1373,9 @@ function JSify(data, functionsOnly, givenFunctions) {
   function insertelementHandler(item) {
     var base = getVectorBaseType(item.type);
     var ident = ensureVector(item.ident, base);
+    var laneOp = ((base == 'float') ? 'SIMD.float32x4.with' : 'SIMD.int32x4.with');
     //return ident + '.with' + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + finalizeLLVMParameter(item.value) + ')';
-    return 'SIMD.with' + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + ident + ',' + finalizeLLVMParameter(item.value) + ')';
+    return laneOp + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + ident + ',' + finalizeLLVMParameter(item.value) + ')';
   }
   function extractelementHandler(item) {
     var base = getVectorBaseType(item.type);
diff --git a/src/library.js b/src/library.js
index 128bb211..faca945c 100644
--- a/src/library.js
+++ b/src/library.js
@@ -8736,8 +8736,72 @@ LibraryManager.library = {
   // emscripten vector ops
   //============================
 
-  emscripten_float32x4_signmask__inline: function(x) {
-    return x + '.signMask()';
+  emscripten_float32x4_signmask__inline: function(a) {
+    return 'SIMD.float32x4.bitsToInt32x4(' + a + ').signMask';
+  },
+  
+  emscripten_float32x4_min__inline: function(a, b) {
+    return 'SIMD.float32x4.min(' + a + ', ' + b + ')';
+  },
+  
+  emscripten_float32x4_max__inline: function(a, b) {
+    return 'SIMD.float32x4.max(' + a + ', ' + b + ')';
+  },
+  
+  emscripten_float32x4_sqrt__inline: function(a) {
+    return 'SIMD.float32x4.sqrt(' + a + ')';
+  },
+  
+  emscripten_float32x4_lessThan__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.lessThan(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_lessThanOrEqual__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.lessThanOrEqual(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_equal__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.equal(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_greaterThanOrEqual__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.greaterThanOrEqual(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_greaterThan__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.greaterThan(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_and__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.and(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_andNot__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.and(SIMD.int32x4.not(SIMD.float32x4.bitsToInt32x4(' + a + ')), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_or__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.or(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_xor__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.xor(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_int32x4_bitsToFloat32x4__inline: function(a) { // Emits a bit-preserving reinterpretation of an int32x4 as float32x4.
+    return 'SIMD.int32x4.bitsToFloat32x4(' + a + ')';
+  },
+
+  emscripten_int32x4_toFloat32x4__inline: function(a) { // Emits a value conversion from int32x4 to float32x4.
+    return 'SIMD.int32x4.toFloat32x4(' + a + ')';
+  },
+
+  emscripten_float32x4_bitsToInt32x4__inline: function(a) { // Emits a bit-preserving reinterpretation of a float32x4 as int32x4.
+    return 'SIMD.float32x4.bitsToInt32x4(' + a + ')';
+  },
+
+  emscripten_float32x4_toInt32x4__inline: function(a) { // Emits a value conversion from float32x4 to int32x4.
+    return 'SIMD.float32x4.toInt32x4(' + a + ')';
   },
 
   //============================
diff --git a/src/library_gl.js b/src/library_gl.js
index afd36197..95da8f09 100644
--- a/src/library_gl.js
+++ b/src/library_gl.js
@@ -288,6 +288,22 @@ var LibraryGL = {
       }
     },
 
+#if GL_FFP_ONLY
+    enabledClientAttribIndices: [],
+    enableVertexAttribArray: function enableVertexAttribArray(index) {
+      if (!GL.enabledClientAttribIndices[index]) {
+        GL.enabledClientAttribIndices[index] = true;
+        Module.ctx.enableVertexAttribArray(index);
+      }
+    },
+    disableVertexAttribArray: function disableVertexAttribArray(index) {
+      if (GL.enabledClientAttribIndices[index]) {
+        GL.enabledClientAttribIndices[index] = false;
+        Module.ctx.disableVertexAttribArray(index);
+      }
+    },
+#endif
+
 #if FULL_ES2
     calcBufLength: function calcBufLength(size, type, stride, count) {
       if (stride > 0) {
@@ -568,6 +584,12 @@ var LibraryGL = {
         var formats = Module.ctx.getParameter(0x86A3 /*GL_COMPRESSED_TEXTURE_FORMATS*/);
         {{{ makeSetValue('p', '0', 'formats.length', 'i32') }}};
         return;
+      case 0x8B9A: // GL_IMPLEMENTATION_COLOR_READ_TYPE
+        {{{ makeSetValue('p', '0', '0x1401', 'i32') }}}; // GL_UNSIGNED_BYTE
+        return;
+      case 0x8B9B: // GL_IMPLEMENTATION_COLOR_READ_FORMAT
+        {{{ makeSetValue('p', '0', '0x1908', 'i32') }}}; // GL_RGBA
+        return;
     }
     var result = Module.ctx.getParameter(name_);
     switch (typeof(result)) {
@@ -641,6 +663,12 @@ var LibraryGL = {
         var formats = Module.ctx.getParameter(0x86A3 /*GL_COMPRESSED_TEXTURE_FORMATS*/);
         {{{ makeSetValue('p', '0', 'formats.length', 'float') }}};
         return;
+      case 0x8B9A: // GL_IMPLEMENTATION_COLOR_READ_TYPE
+        {{{ makeSetValue('p', '0', '0x1401', 'i32') }}}; // GL_UNSIGNED_BYTE
+        return;
+      case 0x8B9B: // GL_IMPLEMENTATION_COLOR_READ_FORMAT
+        {{{ makeSetValue('p', '0', '0x1908', 'i32') }}}; // GL_RGBA
+        return;
     }
     
     var result = Module.ctx.getParameter(name_);
@@ -1808,7 +1836,7 @@ var LibraryGL = {
 
       // Add some emulation workarounds
       Module.printErr('WARNING: using emscripten GL emulation. This is a collection of limited workarounds, do not expect it to work.');
-#if GL_UNSAFE_OPTS == 0
+#if GL_UNSAFE_OPTS == 1
       Module.printErr('WARNING: using emscripten GL emulation unsafe opts. If weirdness happens, try -s GL_UNSAFE_OPTS=0');
 #endif
 
@@ -2149,7 +2177,10 @@ var LibraryGL = {
           }
         }
 #endif
-        GL.currProgram = program;
+        if (GL.currProgram != program) {
+          GL.immediate.currentRenderer = null; // The renderer cache lives on GL.immediate; a program change alters the FFP emulation shader, so force a fresh lookup.
+          GL.currProgram = program;
+        }
         glUseProgram(program);
       }
 
@@ -2689,32 +2720,85 @@ var LibraryGL = {
           GL_SRC_ALPHA
         ];
 
-        this.traverseState = function CTexEnv_traverseState(keyView) {
-          keyView.next(this.mode);
-          keyView.next(this.colorCombiner);
-          keyView.next(this.alphaCombiner);
-          keyView.next(this.colorCombiner);
-          keyView.next(this.alphaScale);
-          keyView.next(this.envColor[0]);
-          keyView.next(this.envColor[1]);
-          keyView.next(this.envColor[2]);
-          keyView.next(this.envColor[3]);
-
-          keyView.next(this.colorSrc[0]);
-          keyView.next(this.colorSrc[1]);
-          keyView.next(this.colorSrc[2]);
-
-          keyView.next(this.alphaSrc[0]);
-          keyView.next(this.alphaSrc[1]);
-          keyView.next(this.alphaSrc[2]);
-
-          keyView.next(this.colorOp[0]);
-          keyView.next(this.colorOp[1]);
-          keyView.next(this.colorOp[2]);
-
-          keyView.next(this.alphaOp[0]);
-          keyView.next(this.alphaOp[1]);
-          keyView.next(this.alphaOp[2]);
+        // Map GLenums to small integers so that CTexEnv state packs into a few numeric keys. Values only need to be unique within one group below.
+        this.traverseKey = {
+          // mode
+          0x1E01 /* GL_REPLACE */: 0,
+          0x2100 /* GL_MODULATE */: 1,
+          0x0104 /* GL_ADD */: 2,
+          0x0BE2 /* GL_BLEND */: 3,
+          0x2101 /* GL_DECAL */: 4,
+          0x8570 /* GL_COMBINE */: 5,
+
+          // additional color and alpha combiners
+          0x84E7 /* GL_SUBTRACT */: 3,
+          0x8575 /* GL_INTERPOLATE */: 4,
+
+          // color and alpha src
+          0x1702 /* GL_TEXTURE */: 0,
+          0x8576 /* GL_CONSTANT */: 1,
+          0x8577 /* GL_PRIMARY_COLOR */: 2,
+          0x8578 /* GL_PREVIOUS */: 3,
+
+          // color and alpha op
+          0x0300 /* GL_SRC_COLOR */: 0,
+          0x0301 /* GL_ONE_MINUS_SRC_COLOR */: 1,
+          0x0302 /* GL_SRC_ALPHA */: 2,
+          0x0303 /* GL_ONE_MINUS_SRC_ALPHA */: 3
+        };
+
+        // The tuple (key0,key1,key2) uniquely identifies the state of the variables in CTexEnv.
+        // -1 on key0 denotes 'the whole cached key is dirty'
+        this.key0 = -1;
+        this.key1 = 0;
+        this.key2 = 0;
+
+        this.computeKey0 = function() {
+          var k = this.traverseKey;
+          var key = k[this.mode] * 1638400; // 6 distinct values.
+          key += k[this.colorCombiner] * 327680; // 5 distinct values.
+          key += k[this.alphaCombiner] * 65536; // 5 distinct values.
+          // The above three fields have 6*5*5=150 distinct values -> 8 bits.
+          key += (this.colorScale-1) * 16384; // 10 bits used.
+          key += (this.alphaScale-1) * 4096; // 12 bits used.
+          key += k[this.colorSrc[0]] * 1024; // 14
+          key += k[this.colorSrc[1]] * 256; // 16
+          key += k[this.colorSrc[2]] * 64; // 18
+          key += k[this.alphaSrc[0]] * 16; // 20
+          key += k[this.alphaSrc[1]] * 4; // 22
+          key += k[this.alphaSrc[2]]; // 24 bits used total.
+          return key;
+        }
+        this.computeKey1 = function() { // Packs the three color and three alpha operand enums into a single number.
+          var k = this.traverseKey;
+          var key = k[this.colorOp[0]] * 4096; // Declared with 'var' so the accumulator stays local instead of becoming an implicit global.
+          key += k[this.colorOp[1]] * 1024;
+          key += k[this.colorOp[2]] * 256;
+          key += k[this.alphaOp[0]] * 16;
+          key += k[this.alphaOp[1]] * 4;
+          key += k[this.alphaOp[2]];
+          return key;
+        }
+        // TODO: remove this. The color should not be part of the key!
+        this.computeKey2 = function() {
+          return this.envColor[0] * 16777216 + this.envColor[1] * 65536 + this.envColor[2] * 256 + 1 + this.envColor[3];
+        }
+        this.recomputeKey = function() {
+          this.key0 = this.computeKey0();
+          this.key1 = this.computeKey1();
+          this.key2 = this.computeKey2();
+        }
+        this.invalidateKey = function() {
+          this.key0 = -1; // The key of this texture unit must be recomputed when rendering the next time.
+          GL.immediate.currentRenderer = null; // The currently used renderer must be re-evaluated at next render.
+        }
+        this.traverseState = function(keyView) {
+          if (this.key0 == -1) {
+            this.recomputeKey();
+          }
+          keyView.next(this.key0);
+          keyView.next(this.key1);
+          keyView.next(this.key2);
         };
       }
 
@@ -3076,16 +3160,28 @@ var LibraryGL = {
           var cur = getCurTexUnit();
           switch (cap) {
             case GL_TEXTURE_1D:
-              cur.enabled_tex1D = true;
+              if (!cur.enabled_tex1D) {
+                GL.immediate.currentRenderer = null; // Renderer state changed, and must be recreated or looked up again.
+                cur.enabled_tex1D = true;
+              }
               break;
             case GL_TEXTURE_2D:
-              cur.enabled_tex2D = true;
+              if (!cur.enabled_tex2D) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_tex2D = true;
+              }
               break;
             case GL_TEXTURE_3D:
-              cur.enabled_tex3D = true;
+              if (!cur.enabled_tex3D) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_tex3D = true;
+              }
               break;
             case GL_TEXTURE_CUBE_MAP:
-              cur.enabled_texCube = true;
+              if (!cur.enabled_texCube) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_texCube = true;
+              }
               break;
           }
         },
@@ -3094,16 +3190,28 @@ var LibraryGL = {
           var cur = getCurTexUnit();
           switch (cap) {
             case GL_TEXTURE_1D:
-              cur.enabled_tex1D = false;
+              if (cur.enabled_tex1D) {
+                GL.immediate.currentRenderer = null; // Renderer state changed, and must be recreated or looked up again.
+                cur.enabled_tex1D = false;
+              }
               break;
             case GL_TEXTURE_2D:
-              cur.enabled_tex2D = false;
+              if (cur.enabled_tex2D) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_tex2D = false;
+              }
               break;
             case GL_TEXTURE_3D:
-              cur.enabled_tex3D = false;
+              if (cur.enabled_tex3D) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_tex3D = false;
+              }
               break;
             case GL_TEXTURE_CUBE_MAP:
-              cur.enabled_texCube = false;
+              if (cur.enabled_texCube) {
+                GL.immediate.currentRenderer = null;
+                cur.enabled_texCube = false;
+              }
               break;
           }
         },
@@ -3115,10 +3223,16 @@ var LibraryGL = {
           var env = getCurTexUnit().env;
           switch (pname) {
             case GL_RGB_SCALE:
-              env.colorScale = param;
+              if (env.colorScale != param) {
+                env.invalidateKey(); // We changed FFP emulation renderer state.
+                env.colorScale = param;
+              }
               break;
             case GL_ALPHA_SCALE:
-              env.alphaScale = param;
+              if (env.alphaScale != param) {
+                env.invalidateKey();
+                env.alphaScale = param;
+              }
               break;
 
             default:
@@ -3133,61 +3247,112 @@ var LibraryGL = {
           var env = getCurTexUnit().env;
           switch (pname) {
             case GL_TEXTURE_ENV_MODE:
-              env.mode = param;
+              if (env.mode != param) {
+                env.invalidateKey(); // We changed FFP emulation renderer state.
+                env.mode = param;
+              }
               break;
 
             case GL_COMBINE_RGB:
-              env.colorCombiner = param;
+              if (env.colorCombiner != param) {
+                env.invalidateKey();
+                env.colorCombiner = param;
+              }
               break;
             case GL_COMBINE_ALPHA:
-              env.alphaCombiner = param;
+              if (env.alphaCombiner != param) {
+                env.invalidateKey();
+                env.alphaCombiner = param;
+              }
               break;
 
             case GL_SRC0_RGB:
-              env.colorSrc[0] = param;
+              if (env.colorSrc[0] != param) {
+                env.invalidateKey();
+                env.colorSrc[0] = param;
+              }
               break;
             case GL_SRC1_RGB:
-              env.colorSrc[1] = param;
+              if (env.colorSrc[1] != param) {
+                env.invalidateKey();
+                env.colorSrc[1] = param;
+              }
               break;
             case GL_SRC2_RGB:
-              env.colorSrc[2] = param;
+              if (env.colorSrc[2] != param) {
+                env.invalidateKey();
+                env.colorSrc[2] = param;
+              }
               break;
 
             case GL_SRC0_ALPHA:
-              env.alphaSrc[0] = param;
+              if (env.alphaSrc[0] != param) {
+                env.invalidateKey();
+                env.alphaSrc[0] = param;
+              }
               break;
             case GL_SRC1_ALPHA:
-              env.alphaSrc[1] = param;
+              if (env.alphaSrc[1] != param) {
+                env.invalidateKey();
+                env.alphaSrc[1] = param;
+              }
               break;
             case GL_SRC2_ALPHA:
-              env.alphaSrc[2] = param;
+              if (env.alphaSrc[2] != param) {
+                env.invalidateKey();
+                env.alphaSrc[2] = param;
+              }
               break;
 
             case GL_OPERAND0_RGB:
-              env.colorOp[0] = param;
+              if (env.colorOp[0] != param) {
+                env.invalidateKey();
+                env.colorOp[0] = param;
+              }
               break;
             case GL_OPERAND1_RGB:
-              env.colorOp[1] = param;
+              if (env.colorOp[1] != param) {
+                env.invalidateKey();
+                env.colorOp[1] = param;
+              }
               break;
             case GL_OPERAND2_RGB:
-              env.colorOp[2] = param;
+              if (env.colorOp[2] != param) {
+                env.invalidateKey();
+                env.colorOp[2] = param;
+              }
               break;
 
             case GL_OPERAND0_ALPHA:
-              env.alphaOp[0] = param;
+              if (env.alphaOp[0] != param) {
+                env.invalidateKey();
+                env.alphaOp[0] = param;
+              }
               break;
             case GL_OPERAND1_ALPHA:
-              env.alphaOp[1] = param;
+              if (env.alphaOp[1] != param) {
+                env.invalidateKey();
+                env.alphaOp[1] = param;
+              }
               break;
             case GL_OPERAND2_ALPHA:
-              env.alphaOp[2] = param;
+              if (env.alphaOp[2] != param) {
+                env.invalidateKey();
+                env.alphaOp[2] = param;
+              }
               break;
 
             case GL_RGB_SCALE:
-              env.colorScale = param;
+              if (env.colorScale != param) {
+                env.invalidateKey();
+                env.colorScale = param;
+              }
               break;
             case GL_ALPHA_SCALE:
-              env.alphaScale = param;
+              if (env.alphaScale != param) {
+                env.invalidateKey();
+                env.alphaScale = param;
+              }
               break;
 
             default:
@@ -3203,7 +3368,10 @@ var LibraryGL = {
             case GL_TEXTURE_ENV_COLOR: {
               for (var i = 0; i < 4; i++) {
                 var param = {{{ makeGetValue('params', 'i*4', 'float') }}};
-                env.envColor[i] = param;
+                if (env.envColor[i] != param) {
+                  env.invalidateKey(); // We changed FFP emulation renderer state.
+                  env.envColor[i] = param;
+                }
               }
               break
             }
@@ -3243,26 +3411,21 @@ var LibraryGL = {
     NORMAL: 1,
     COLOR: 2,
     TEXTURE0: 3,
-    TEXTURE1: 4,
-    TEXTURE2: 5,
-    TEXTURE3: 6,
-    TEXTURE4: 7,
-    TEXTURE5: 8,
-    TEXTURE6: 9,
-    NUM_ATTRIBUTES: 10, // Overwritten in init().
-    MAX_TEXTURES: 7,    // Overwritten in init().
+    NUM_ATTRIBUTES: -1, // Initialized in GL emulation init().
+    MAX_TEXTURES: -1,   // Initialized in GL emulation init().
 
     totalEnabledClientAttributes: 0,
     enabledClientAttributes: [0, 0],
     clientAttributes: [], // raw data, including possible unneeded ones
     liveClientAttributes: [], // the ones actually alive in the current computation, sorted
+    currentRenderer: null, // Caches the currently active FFP emulation renderer, so that it does not have to be re-looked up unless relevant state changes.
     modifiedClientAttributes: false,
     clientActiveTexture: 0,
     clientColor: null,
     usedTexUnitList: [],
     fixedFunctionProgram: null,
 
-    setClientAttribute: function(name, size, type, stride, pointer) {
+    setClientAttribute: function setClientAttribute(name, size, type, stride, pointer) {
       var attrib = this.clientAttributes[name];
       if (!attrib) {
         for (var i = 0; i <= name; i++) { // keep flat
@@ -3289,7 +3452,7 @@ var LibraryGL = {
     },
 
     // Renderers
-    addRendererComponent: function(name, size, type) {
+    addRendererComponent: function addRendererComponent(name, size, type) {
       if (!this.rendererComponents[name]) {
         this.rendererComponents[name] = 1;
 #if ASSERTIONS
@@ -3305,13 +3468,18 @@ var LibraryGL = {
       }
     },
 
-    disableBeginEndClientAttributes: function() {
+    disableBeginEndClientAttributes: function disableBeginEndClientAttributes() {
       for (var i = 0; i < this.NUM_ATTRIBUTES; i++) {
         if (this.rendererComponents[i]) this.enabledClientAttributes[i] = false;
       }
     },
 
-    getRenderer: function() {
+    getRenderer: function getRenderer() {
+      // If no FFP state has changed that would have forced to re-evaluate which FFP emulation shader to use,
+      // we have the currently used renderer in cache, and can immediately return that.
+      if (this.currentRenderer) {
+        return this.currentRenderer;
+      }
       // return a renderer object given the liveClientAttributes
       // we maintain a cache of renderers, optimized to not generate garbage
       var attributes = GL.immediate.liveClientAttributes;
@@ -3320,10 +3488,11 @@ var LibraryGL = {
       var keyView = cacheMap.getStaticKeyView().reset();
 
       // By attrib state:
+      var enabledAttributesKey = 0;
       for (var i = 0; i < attributes.length; i++) {
-        var attribute = attributes[i];
-        keyView.next(attribute.name).next(attribute.size).next(attribute.type);
+        enabledAttributesKey |= 1 << attributes[i].name;
       }
+      keyView.next(enabledAttributesKey);
 
       // By fog state:
       var fogParam = 0;
@@ -3349,18 +3518,23 @@ var LibraryGL = {
       }
 
       // If we don't already have it, create it.
-      if (!keyView.get()) {
+      var renderer = keyView.get();
+      if (!renderer) {
 #if GL_DEBUG
         Module.printErr('generating renderer for ' + JSON.stringify(attributes));
 #endif
-        keyView.set(this.createRenderer());
+        renderer = this.createRenderer();
+        this.currentRenderer = renderer;
+        keyView.set(renderer);
+        return renderer;
       }
-      return keyView.get();
+      this.currentRenderer = renderer; // Cache the currently used renderer, so later lookups without state changes can get this fast.
+      return renderer;
     },
 
-    createRenderer: function(renderer) {
+    createRenderer: function createRenderer(renderer) {
       var useCurrProgram = !!GL.currProgram;
-      var hasTextures = false, textureSizes = [], textureTypes = [];
+      var hasTextures = false;
       for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
         var texAttribName = GL.immediate.TEXTURE0 + i;
         if (!GL.immediate.enabledClientAttributes[texAttribName])
@@ -3374,24 +3548,11 @@ var LibraryGL = {
         }
 #endif
 
-        textureSizes[i] = GL.immediate.clientAttributes[texAttribName].size;
-        textureTypes[i] = GL.immediate.clientAttributes[texAttribName].type;
         hasTextures = true;
       }
-      var positionSize = GL.immediate.clientAttributes[GL.immediate.VERTEX].size;
-      var positionType = GL.immediate.clientAttributes[GL.immediate.VERTEX].type;
-      var colorSize = 0, colorType;
-      if (GL.immediate.enabledClientAttributes[GL.immediate.COLOR]) {
-        colorSize = GL.immediate.clientAttributes[GL.immediate.COLOR].size;
-        colorType = GL.immediate.clientAttributes[GL.immediate.COLOR].type;
-      }
-      var normalSize = 0, normalType;
-      if (GL.immediate.enabledClientAttributes[GL.immediate.NORMAL]) {
-        normalSize = GL.immediate.clientAttributes[GL.immediate.NORMAL].size;
-        normalType = GL.immediate.clientAttributes[GL.immediate.NORMAL].type;
-      }
+
       var ret = {
-        init: function() {
+        init: function init() {
           // For fixed-function shader generation.
           var uTexUnitPrefix = 'u_texUnit';
           var aTexCoordPrefix = 'a_texCoord';
@@ -3524,10 +3685,25 @@ var LibraryGL = {
             this.program = Module.ctx.createProgram();
             Module.ctx.attachShader(this.program, this.vertexShader);
             Module.ctx.attachShader(this.program, this.fragmentShader);
-            Module.ctx.bindAttribLocation(this.program, 0, 'a_position');
+
+            // As optimization, bind all attributes to prespecified locations, so that the FFP emulation
+            // code can submit attributes to any generated FFP shader without having to examine each shader in turn.
+            // These prespecified locations are only assumed if GL_FFP_ONLY is specified, since user could also create their
+            // own shaders that didn't have attributes in the same locations.
+            Module.ctx.bindAttribLocation(this.program, GL.immediate.VERTEX, 'a_position');
+            Module.ctx.bindAttribLocation(this.program, GL.immediate.COLOR, 'a_color');
+            Module.ctx.bindAttribLocation(this.program, GL.immediate.NORMAL, 'a_normal');
+            for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
+              // Texture coordinate attributes take consecutive locations starting at TEXTURE0; one bind per unit is enough.
+              Module.ctx.bindAttribLocation(this.program, GL.immediate.TEXTURE0 + i, aTexCoordPrefix+i);
+            }
             Module.ctx.linkProgram(this.program);
           }
 
+          // Stores a map that remembers which matrix uniforms are up-to-date in this FFP renderer, so they don't need to be resubmitted
+          // each time we render with this program.
+          this.textureMatrixVersion = {};
+
           this.positionLocation = Module.ctx.getAttribLocation(this.program, 'a_position');
 
           this.texCoordLocations = [];
@@ -3570,7 +3746,9 @@ var LibraryGL = {
           this.projectionLocation = Module.ctx.getUniformLocation(this.program, 'u_projection');
 
           this.hasTextures = hasTextures;
-          this.hasNormal = normalSize > 0 && this.normalLocation >= 0;
+          this.hasNormal = GL.immediate.enabledClientAttributes[GL.immediate.NORMAL] &&
+                           GL.immediate.clientAttributes[GL.immediate.NORMAL].size > 0 &&
+                           this.normalLocation >= 0;
           this.hasColor = (this.colorLocation === 0) || this.colorLocation > 0;
 
           this.floatType = Module.ctx.FLOAT; // minor optimization
@@ -3583,7 +3761,7 @@ var LibraryGL = {
                            this.fogScaleLocation || this.fogDensityLocation);
         },
 
-        prepare: function() {
+        prepare: function prepare() {
           // Calculate the array buffer
           var arrayBuffer;
           if (!GL.currArrayBuffer) {
@@ -3598,10 +3776,10 @@ var LibraryGL = {
             arrayBuffer = GL.currArrayBuffer;
           }
 
+#if GL_UNSAFE_OPTS
           // If the array buffer is unchanged and the renderer as well, then we can avoid all the work here
           // XXX We use some heuristics here, and this may not work in all cases. Try disabling GL_UNSAFE_OPTS if you
           // have odd glitches
-#if GL_UNSAFE_OPTS
           var lastRenderer = GL.immediate.lastRenderer;
           var canSkip = this == lastRenderer &&
                         arrayBuffer == GL.immediate.lastArrayBuffer &&
@@ -3636,62 +3814,105 @@ var LibraryGL = {
             GL.immediate.fixedFunctionProgram = this.program;
           }
 
-          if (this.modelViewLocation) Module.ctx.uniformMatrix4fv(this.modelViewLocation, false, GL.immediate.matrix['m']);
-          if (this.projectionLocation) Module.ctx.uniformMatrix4fv(this.projectionLocation, false, GL.immediate.matrix['p']);
+          if (this.modelViewLocation && this.modelViewMatrixVersion != GL.immediate.matrixVersion['m']) {
+            this.modelViewMatrixVersion = GL.immediate.matrixVersion['m'];
+            Module.ctx.uniformMatrix4fv(this.modelViewLocation, false, GL.immediate.matrix['m']);
+          }
+          if (this.projectionLocation && this.projectionMatrixVersion != GL.immediate.matrixVersion['p']) {
+            this.projectionMatrixVersion = GL.immediate.matrixVersion['p'];
+            Module.ctx.uniformMatrix4fv(this.projectionLocation, false, GL.immediate.matrix['p']);
+          }
 
           var clientAttributes = GL.immediate.clientAttributes;
+          var posAttr = clientAttributes[GL.immediate.VERTEX];
 
 #if GL_ASSERTIONS
-          GL.validateVertexAttribPointer(positionSize, positionType, GL.immediate.stride, clientAttributes[GL.immediate.VERTEX].offset);
+          GL.validateVertexAttribPointer(posAttr.size, posAttr.type, GL.immediate.stride, clientAttributes[GL.immediate.VERTEX].offset);
 #endif
-          Module.ctx.vertexAttribPointer(this.positionLocation, positionSize, positionType, false,
-                                         GL.immediate.stride, clientAttributes[GL.immediate.VERTEX].offset);
+
+#if GL_FFP_ONLY
+          if (!GL.currArrayBuffer) {
+            Module.ctx.vertexAttribPointer(GL.immediate.VERTEX, posAttr.size, posAttr.type, false, GL.immediate.stride, posAttr.offset);
+            GL.enableVertexAttribArray(GL.immediate.VERTEX);
+            if (this.hasNormal) {
+              var normalAttr = clientAttributes[GL.immediate.NORMAL];
+              Module.ctx.vertexAttribPointer(GL.immediate.NORMAL, normalAttr.size, normalAttr.type, true, GL.immediate.stride, normalAttr.offset);
+              GL.enableVertexAttribArray(GL.immediate.NORMAL);
+            }
+          }
+#else
+          Module.ctx.vertexAttribPointer(this.positionLocation, posAttr.size, posAttr.type, false, GL.immediate.stride, posAttr.offset);
           Module.ctx.enableVertexAttribArray(this.positionLocation);
+          if (this.hasNormal) {
+            var normalAttr = clientAttributes[GL.immediate.NORMAL];
+#if GL_ASSERTIONS
+            GL.validateVertexAttribPointer(normalAttr.size, normalAttr.type, GL.immediate.stride, normalAttr.offset);
+#endif
+            Module.ctx.vertexAttribPointer(this.normalLocation, normalAttr.size, normalAttr.type, true, GL.immediate.stride, normalAttr.offset);
+            Module.ctx.enableVertexAttribArray(this.normalLocation);
+          }
+#endif
           if (this.hasTextures) {
-            //for (var i = 0; i < this.usedTexUnitList.length; i++) {
-            //  var texUnitID = this.usedTexUnitList[i];
             for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
-              var texUnitID = i;
-              var attribLoc = this.texCoordLocations[texUnitID];
+#if GL_FFP_ONLY
+              if (!GL.currArrayBuffer) {
+                var attribLoc = GL.immediate.TEXTURE0+i;
+                var texAttr = clientAttributes[attribLoc];
+                if (texAttr.size) {
+                  Module.ctx.vertexAttribPointer(attribLoc, texAttr.size, texAttr.type, false, GL.immediate.stride, texAttr.offset);
+                  GL.enableVertexAttribArray(attribLoc);
+                } else {
+                  // These two might be dangerous, but let's try them.
+                  Module.ctx.vertexAttrib4f(attribLoc, 0, 0, 0, 1);
+                  GL.disableVertexAttribArray(attribLoc);
+                }
+              }
+#else
+              var attribLoc = this.texCoordLocations[i];
               if (attribLoc === undefined || attribLoc < 0) continue;
+              var texAttr = clientAttributes[GL.immediate.TEXTURE0+i];
 
-              if (texUnitID < textureSizes.length && textureSizes[texUnitID]) {
+              if (texAttr.size) {
 #if GL_ASSERTIONS
-                GL.validateVertexAttribPointer(textureSizes[texUnitID], textureTypes[texUnitID], GL.immediate.stride, GL.immediate.clientAttributes[GL.immediate.TEXTURE0 + texUnitID].offset);
+                GL.validateVertexAttribPointer(texAttr.size, texAttr.type, GL.immediate.stride, texAttr.offset);
 #endif
-                Module.ctx.vertexAttribPointer(attribLoc, textureSizes[texUnitID], textureTypes[texUnitID], false,
-                                               GL.immediate.stride, GL.immediate.clientAttributes[GL.immediate.TEXTURE0 + texUnitID].offset);
+                Module.ctx.vertexAttribPointer(attribLoc, texAttr.size, texAttr.type, false, GL.immediate.stride, texAttr.offset);
                 Module.ctx.enableVertexAttribArray(attribLoc);
               } else {
                 // These two might be dangerous, but let's try them.
                 Module.ctx.vertexAttrib4f(attribLoc, 0, 0, 0, 1);
                 Module.ctx.disableVertexAttribArray(attribLoc);
               }
-            }
-            for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
-              if (this.textureMatrixLocations[i]) { // XXX might we need this even without the condition we are currently in?
-                Module.ctx.uniformMatrix4fv(this.textureMatrixLocations[i], false, GL.immediate.matrix['t' + i]);
+#endif
+              var t = 't'+i;
+              if (this.textureMatrixLocations[i] && this.textureMatrixVersion[t] != GL.immediate.matrixVersion[t]) { // XXX might we need this even without the condition we are currently in?
+                this.textureMatrixVersion[t] = GL.immediate.matrixVersion[t];
+                Module.ctx.uniformMatrix4fv(this.textureMatrixLocations[i], false, GL.immediate.matrix[t]);
               }
             }
           }
-          if (colorSize) {
+          if (GL.immediate.enabledClientAttributes[GL.immediate.COLOR]) {
+            var colorAttr = clientAttributes[GL.immediate.COLOR];
 #if GL_ASSERTIONS
-            GL.validateVertexAttribPointer(colorSize, colorType, GL.immediate.stride, clientAttributes[GL.immediate.COLOR].offset);
+            GL.validateVertexAttribPointer(colorAttr.size, colorAttr.type, GL.immediate.stride, colorAttr.offset);
 #endif
-            Module.ctx.vertexAttribPointer(this.colorLocation, colorSize, colorType, true,
-                                           GL.immediate.stride, clientAttributes[GL.immediate.COLOR].offset);
+#if GL_FFP_ONLY
+            if (!GL.currArrayBuffer) {
+              Module.ctx.vertexAttribPointer(GL.immediate.COLOR, colorAttr.size, colorAttr.type, true, GL.immediate.stride, colorAttr.offset);
+              GL.enableVertexAttribArray(GL.immediate.COLOR);
+            }
+#else
+            Module.ctx.vertexAttribPointer(this.colorLocation, colorAttr.size, colorAttr.type, true, GL.immediate.stride, colorAttr.offset);
             Module.ctx.enableVertexAttribArray(this.colorLocation);
+#endif
           } else if (this.hasColor) {
+#if GL_FFP_ONLY
+            GL.disableVertexAttribArray(GL.immediate.COLOR);
+            Module.ctx.vertexAttrib4fv(GL.immediate.COLOR, GL.immediate.clientColor);
+#else
             Module.ctx.disableVertexAttribArray(this.colorLocation);
             Module.ctx.vertexAttrib4fv(this.colorLocation, GL.immediate.clientColor);
-          }
-          if (this.hasNormal) {
-#if GL_ASSERTIONS
-            GL.validateVertexAttribPointer(normalSize, normalType, GL.immediate.stride, clientAttributes[GL.immediate.NORMAL].offset);
 #endif
-            Module.ctx.vertexAttribPointer(this.normalLocation, normalSize, normalType, true,
-                                           GL.immediate.stride, clientAttributes[GL.immediate.NORMAL].offset);
-            Module.ctx.enableVertexAttribArray(this.normalLocation);
           }
           if (this.hasFog) {
             if (this.fogColorLocation) Module.ctx.uniform4fv(this.fogColorLocation, GLEmulation.fogColor);
@@ -3701,11 +3922,12 @@ var LibraryGL = {
           }
         },
 
-        cleanup: function() {
+        cleanup: function cleanup() {
+#if !GL_FFP_ONLY
           Module.ctx.disableVertexAttribArray(this.positionLocation);
           if (this.hasTextures) {
-            for (var i = 0; i < textureSizes.length; i++) {
-              if (textureSizes[i] && this.texCoordLocations[i] >= 0) {
+            for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
+              if (GL.immediate.enabledClientAttributes[GL.immediate.TEXTURE0+i] && this.texCoordLocations[i] >= 0) {
                 Module.ctx.disableVertexAttribArray(this.texCoordLocations[i]);
               }
             }
@@ -3729,6 +3951,7 @@ var LibraryGL = {
           GL.immediate.lastProgram = null;
 #endif
           GL.immediate.matricesModified = true;
+#endif
         }
       };
       ret.init();
@@ -3858,11 +4081,15 @@ var LibraryGL = {
 
       this.TexEnvJIT.init(Module.ctx);
 
-      GL.immediate.MAX_TEXTURES = Module.ctx.getParameter(Module.ctx.MAX_TEXTURE_IMAGE_UNITS);
-      GL.immediate.NUM_ATTRIBUTES = GL.immediate.TEXTURE0 + GL.immediate.MAX_TEXTURES;
+      // User can override the maximum number of texture units that we emulate. Using fewer texture units increases runtime performance
+      // slightly, so it is advantageous to choose as small a value as needed.
+      GL.immediate.MAX_TEXTURES = Module['GL_MAX_TEXTURE_IMAGE_UNITS'] || Module.ctx.getParameter(Module.ctx.MAX_TEXTURE_IMAGE_UNITS);
+      GL.immediate.NUM_ATTRIBUTES = 3 /*pos+normal+color attributes*/ + GL.immediate.MAX_TEXTURES;
       GL.immediate.clientAttributes = [];
+      GLEmulation.enabledClientAttribIndices = [];
       for (var i = 0; i < GL.immediate.NUM_ATTRIBUTES; i++) {
         GL.immediate.clientAttributes.push({});
+        GLEmulation.enabledClientAttribIndices.push(false);
       }
 
       this.matrixStack['m'] = [];
@@ -3872,13 +4099,18 @@ var LibraryGL = {
       }
 
       // Initialize matrix library
-
+      // When user sets a matrix, increment a 'version number' on the new data, and when rendering, submit
+      // the matrices to the shader program only if they have an old version of the data.
+      GL.immediate.matrixVersion = {};
       GL.immediate.matrix['m'] = GL.immediate.matrix.lib.mat4.create();
+      GL.immediate.matrixVersion['m'] = 0;
       GL.immediate.matrix.lib.mat4.identity(GL.immediate.matrix['m']);
       GL.immediate.matrix['p'] = GL.immediate.matrix.lib.mat4.create();
+      GL.immediate.matrixVersion['p'] = 0;
       GL.immediate.matrix.lib.mat4.identity(GL.immediate.matrix['p']);
       for (var i = 0; i < GL.immediate.MAX_TEXTURES; i++) {
         GL.immediate.matrix['t' + i] = GL.immediate.matrix.lib.mat4.create();
+        GL.immediate.matrixVersion['t' + i] = 0;
       }
 
       // Renderer cache
@@ -3899,7 +4131,7 @@ var LibraryGL = {
     // Modifies liveClientAttributes, stride, vertexPointer, vertexCounter
     //   count: number of elements we will draw
     //   beginEnd: whether we are drawing the results of a begin/end block
-    prepareClientAttributes: function(count, beginEnd) {
+    prepareClientAttributes: function prepareClientAttributes(count, beginEnd) {
       // If no client attributes were modified since we were last called, do nothing. Note that this
       // does not work for glBegin/End, where we generate renderer components dynamically and then
       // disable them ourselves, but it does help with glDrawElements/Arrays.
@@ -3997,7 +4229,7 @@ var LibraryGL = {
       }
     },
 
-    flush: function(numProvidedIndexes, startIndex, ptr) {
+    flush: function flush(numProvidedIndexes, startIndex, ptr) {
 #if ASSERTIONS
       assert(numProvidedIndexes >= 0 || !numProvidedIndexes);
 #endif
@@ -4070,7 +4302,7 @@ var LibraryGL = {
         Module.ctx.bindBuffer(Module.ctx.ELEMENT_ARRAY_BUFFER, GL.buffers[GL.currElementArrayBuffer] || null);
       }
 
-#if GL_UNSAFE_OPTS == 0
+#if GL_UNSAFE_OPTS == 0 && !GL_FFP_ONLY
       renderer.cleanup();
 #endif
     }
@@ -4237,7 +4469,7 @@ var LibraryGL = {
   glColor4ubv__deps: ['glColor4ub'],
   glColor4ubv: function(p) {
     _glColor4ub({{{ makeGetValue('p', '0', 'i8') }}}, {{{ makeGetValue('p', '1', 'i8') }}}, {{{ makeGetValue('p', '2', 'i8') }}}, {{{ makeGetValue('p', '3', 'i8') }}});
-	},
+  },
 
   glFogf: function(pname, param) { // partial support, TODO
     switch(pname) {
@@ -4318,10 +4550,12 @@ var LibraryGL = {
     if (disable && GL.immediate.enabledClientAttributes[attrib]) {
       GL.immediate.enabledClientAttributes[attrib] = false;
       GL.immediate.totalEnabledClientAttributes--;
+      this.currentRenderer = null; // Will need to change current renderer, since the set of active vertex pointers changed.
       if (GLEmulation.currentVao) delete GLEmulation.currentVao.enabledClientStates[cap];
     } else if (!disable && !GL.immediate.enabledClientAttributes[attrib]) {
       GL.immediate.enabledClientAttributes[attrib] = true;
       GL.immediate.totalEnabledClientAttributes++;
+      this.currentRenderer = null; // Will need to change current renderer, since the set of active vertex pointers changed.
       if (GLEmulation.currentVao) GLEmulation.currentVao.enabledClientStates[cap] = 1;
     }
     GL.immediate.modifiedClientAttributes = true;
@@ -4333,15 +4567,40 @@ var LibraryGL = {
   glVertexPointer__deps: ['$GLEmulation'], // if any pointers are used, glVertexPointer must be, and if it is, then we need emulation
   glVertexPointer: function(size, type, stride, pointer) {
     GL.immediate.setClientAttribute(GL.immediate.VERTEX, size, type, stride, pointer);
+#if GL_FFP_ONLY
+    if (GL.currArrayBuffer) {
+      Module.ctx.vertexAttribPointer(GL.immediate.VERTEX, size, type, false, stride, pointer);
+      GL.enableVertexAttribArray(GL.immediate.VERTEX);
+    }
+#endif
   },
   glTexCoordPointer: function(size, type, stride, pointer) {
     GL.immediate.setClientAttribute(GL.immediate.TEXTURE0 + GL.immediate.clientActiveTexture, size, type, stride, pointer);
+#if GL_FFP_ONLY
+    if (GL.currArrayBuffer) {
+      var loc = GL.immediate.TEXTURE0 + GL.immediate.clientActiveTexture;
+      Module.ctx.vertexAttribPointer(loc, size, type, false, stride, pointer);
+      GL.enableVertexAttribArray(loc);
+    }
+#endif
   },
   glNormalPointer: function(type, stride, pointer) {
     GL.immediate.setClientAttribute(GL.immediate.NORMAL, 3, type, stride, pointer);
+#if GL_FFP_ONLY
+    if (GL.currArrayBuffer) {
+      Module.ctx.vertexAttribPointer(GL.immediate.NORMAL, 3, type, true, stride, pointer); // glNormalPointer has no size parameter; use the same fixed component count (3) as setClientAttribute above
+      GL.enableVertexAttribArray(GL.immediate.NORMAL);
+    }
+#endif
   },
   glColorPointer: function(size, type, stride, pointer) {
     GL.immediate.setClientAttribute(GL.immediate.COLOR, size, type, stride, pointer);
+#if GL_FFP_ONLY
+    if (GL.currArrayBuffer) {
+      Module.ctx.vertexAttribPointer(GL.immediate.COLOR, size, type, true, stride, pointer);
+      GL.enableVertexAttribArray(GL.immediate.COLOR);
+    }
+#endif
   },
 
   glClientActiveTexture__sig: 'vi',
@@ -4424,23 +4683,27 @@ var LibraryGL = {
 
   glPushMatrix: function() {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrixStack[GL.immediate.currentMatrix].push(
         Array.prototype.slice.call(GL.immediate.matrix[GL.immediate.currentMatrix]));
   },
 
   glPopMatrix: function() {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix[GL.immediate.currentMatrix] = GL.immediate.matrixStack[GL.immediate.currentMatrix].pop();
   },
 
   glLoadIdentity__deps: ['$GL', '$GLImmediateSetup'],
   glLoadIdentity: function() {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.identity(GL.immediate.matrix[GL.immediate.currentMatrix]);
   },
 
   glLoadMatrixd: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F64', 'matrix', 'matrix+' + (16*8)) }}}, GL.immediate.matrix[GL.immediate.currentMatrix]);
   },
 
@@ -4449,35 +4712,41 @@ var LibraryGL = {
     if (GL.debug) Module.printErr('glLoadMatrixf receiving: ' + Array.prototype.slice.call(HEAPF32.subarray(matrix >> 2, (matrix >> 2) + 16)));
 #endif
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F32', 'matrix', 'matrix+' + (16*4)) }}}, GL.immediate.matrix[GL.immediate.currentMatrix]);
   },
 
   glLoadTransposeMatrixd: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F64', 'matrix', 'matrix+' + (16*8)) }}}, GL.immediate.matrix[GL.immediate.currentMatrix]);
     GL.immediate.matrix.lib.mat4.transpose(GL.immediate.matrix[GL.immediate.currentMatrix]);
   },
 
   glLoadTransposeMatrixf: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F32', 'matrix', 'matrix+' + (16*4)) }}}, GL.immediate.matrix[GL.immediate.currentMatrix]);
     GL.immediate.matrix.lib.mat4.transpose(GL.immediate.matrix[GL.immediate.currentMatrix]);
   },
 
   glMultMatrixd: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.multiply(GL.immediate.matrix[GL.immediate.currentMatrix],
         {{{ makeHEAPView('F64', 'matrix', 'matrix+' + (16*8)) }}});
   },
 
   glMultMatrixf: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.multiply(GL.immediate.matrix[GL.immediate.currentMatrix],
         {{{ makeHEAPView('F32', 'matrix', 'matrix+' + (16*4)) }}});
   },
 
   glMultTransposeMatrixd: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     var colMajor = GL.immediate.matrix.lib.mat4.create();
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F64', 'matrix', 'matrix+' + (16*8)) }}}, colMajor);
     GL.immediate.matrix.lib.mat4.transpose(colMajor);
@@ -4486,6 +4755,7 @@ var LibraryGL = {
 
   glMultTransposeMatrixf: function(matrix) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     var colMajor = GL.immediate.matrix.lib.mat4.create();
     GL.immediate.matrix.lib.mat4.set({{{ makeHEAPView('F32', 'matrix', 'matrix+' + (16*4)) }}}, colMajor);
     GL.immediate.matrix.lib.mat4.transpose(colMajor);
@@ -4494,6 +4764,7 @@ var LibraryGL = {
 
   glFrustum: function(left, right, bottom, top_, nearVal, farVal) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.multiply(GL.immediate.matrix[GL.immediate.currentMatrix],
         GL.immediate.matrix.lib.mat4.frustum(left, right, bottom, top_, nearVal, farVal));
   },
@@ -4501,6 +4772,7 @@ var LibraryGL = {
 
   glOrtho: function(left, right, bottom, top_, nearVal, farVal) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.multiply(GL.immediate.matrix[GL.immediate.currentMatrix],
         GL.immediate.matrix.lib.mat4.ortho(left, right, bottom, top_, nearVal, farVal));
   },
@@ -4508,18 +4780,21 @@ var LibraryGL = {
 
   glScaled: function(x, y, z) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.scale(GL.immediate.matrix[GL.immediate.currentMatrix], [x, y, z]);
   },
   glScalef: 'glScaled',
 
   glTranslated: function(x, y, z) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.translate(GL.immediate.matrix[GL.immediate.currentMatrix], [x, y, z]);
   },
   glTranslatef: 'glTranslated',
 
   glRotated: function(angle, x, y, z) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.rotate(GL.immediate.matrix[GL.immediate.currentMatrix], angle*Math.PI/180, [x, y, z]);
   },
   glRotatef: 'glRotated',
@@ -4602,6 +4877,7 @@ var LibraryGL = {
 
   gluPerspective: function(fov, aspect, near, far) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix[GL.immediate.currentMatrix] =
       GL.immediate.matrix.lib.mat4.perspective(fov, aspect, near, far,
                                                GL.immediate.matrix[GL.immediate.currentMatrix]);
@@ -4609,6 +4885,7 @@ var LibraryGL = {
 
   gluLookAt: function(ex, ey, ez, cx, cy, cz, ux, uy, uz) {
     GL.immediate.matricesModified = true;
+    GL.immediate.matrixVersion[GL.immediate.currentMatrix] = (GL.immediate.matrixVersion[GL.immediate.currentMatrix] + 1)|0;
     GL.immediate.matrix.lib.mat4.lookAt(GL.immediate.matrix[GL.immediate.currentMatrix], [ex, ey, ez],
         [cx, cy, cz], [ux, uy, uz]);
   },
diff --git a/src/modules.js b/src/modules.js
index 79f494c0..29fca664 100644
--- a/src/modules.js
+++ b/src/modules.js
@@ -500,6 +500,7 @@ var PassManager = {
     for (var i in data.Functions) {
       Functions[i] = data.Functions[i];
     }
+    EXPORTED_FUNCTIONS = data.EXPORTED_FUNCTIONS;
     /*
     print('\n//LOADED_DATA:' + phase + ':' + JSON.stringify({
       Types: Types,
diff --git a/src/parseTools.js b/src/parseTools.js
index 08cf9b60..ffd7c758 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -362,7 +362,7 @@ function getVectorNativeType(type) {
 
 function getSIMDName(type) {
   switch (type) {
-    case 'i32': return 'uint';
+    case 'i32': return 'int';
     case 'float': return 'float';
     default: throw 'getSIMDName ' + type;
   }
@@ -603,10 +603,11 @@ function parseLLVMSegment(segment) {
     type = segment[0].text;
     if (type[type.length-1] === '>' && segment[1].text[0] === '<') {
       // vector literal
+      var nativeType = getVectorNativeType(type);
       return {
         intertype: 'vector',
         idents: splitTokenList(segment[1].tokens).map(function(pair) {
-          return pair[1].text;
+          return parseNumerical(pair[1].text, nativeType);
         }),
         type: type
       };
@@ -2371,29 +2372,28 @@ function processMathop(item) {
     // vector/SIMD operation
     Types.usesSIMD = true;
     switch (op) {
-      case 'fadd': return 'SIMD.add(' + idents[0] + ',' + idents[1] + ')';
-      case 'fsub': return 'SIMD.sub(' + idents[0] + ',' + idents[1] + ')';
-      case 'fmul': return 'SIMD.mul(' + idents[0] + ',' + idents[1] + ')';
-      case 'fdiv': return 'SIMD.div(' + idents[0] + ',' + idents[1] + ')';
-      case 'add' : return 'SIMD.addu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'sub' : return 'SIMD.subu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'mul' : return 'SIMD.mulu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'udiv': return 'SIMD.divu32(' + idents[0] + ',' + idents[1] + ')';
+      case 'fadd': return 'SIMD.float32x4.add(' + idents[0] + ',' + idents[1] + ')';
+      case 'fsub': return 'SIMD.float32x4.sub(' + idents[0] + ',' + idents[1] + ')';
+      case 'fmul': return 'SIMD.float32x4.mul(' + idents[0] + ',' + idents[1] + ')';
+      case 'fdiv': return 'SIMD.float32x4.div(' + idents[0] + ',' + idents[1] + ')';
+      case 'add' : return 'SIMD.int32x4.add(' + idents[0] + ',' + idents[1] + ')';
+      case 'sub' : return 'SIMD.int32x4.sub(' + idents[0] + ',' + idents[1] + ')';
+      case 'mul' : return 'SIMD.int32x4.mul(' + idents[0] + ',' + idents[1] + ')';
       case 'bitcast': {
         var inType = item.params[0].type;
         var outType = item.type;
         if (inType === '<4 x float>') {
           assert(outType === '<4 x i32>');
-          return 'SIMD.float32x4BitsToUint32x4(' + idents[0] + ')';
+          return 'SIMD.float32x4.bitsToInt32x4(' + idents[0] + ')';
         } else {
           assert(inType === '<4 x i32>');
           assert(outType === '<4 x float>');
-          return 'SIMD.uint32x4BitsToFloat32x4(' + idents[0] + ')';
+          return 'SIMD.int32x4.bitsToFloat32x4(' + idents[0] + ')';
         }
       }
-      case 'and': return 'SIMD.and(' + idents[0] + ',' + idents[1] + ')';
-      case 'or': return 'SIMD.or(' + idents[0] + ',' + idents[1] + ')';
-      case 'xor': return 'SIMD.xor(' + idents[0] + ',' + idents[1] + ')';
+      case 'and': return 'SIMD.int32x4.and(' + idents[0] + ',' + idents[1] + ')';
+      case 'or': return 'SIMD.int32x4.or(' + idents[0] + ',' + idents[1] + ')';
+      case 'xor': return 'SIMD.int32x4.xor(' + idents[0] + ',' + idents[1] + ')';
       default: throw 'vector op todo: ' + dump(item);
     }
   }
@@ -2697,7 +2697,7 @@ var simdLane = ['x', 'y', 'z', 'w'];
 
 function ensureVector(ident, base) {
   Types.usesSIMD = true;
-  return ident == 0 ? base + '32x4.zero()' : ident;
+  return ident == 0 ? base + '32x4.splat(0)' : ident;
 }
 
 function ensureValidFFIType(type) {
diff --git a/src/simd.js b/src/simd.js
index bbb12d0a..c7f5ff48 100644
--- a/src/simd.js
+++ b/src/simd.js
@@ -20,8 +20,10 @@
   https://github.com/johnmccutchan/ecmascript_simd/blob/master/src/ecmascript_simd.js
 */
 
+"use strict";
+
 /**
-  * Construct a new instance of a float32x4 number.
+  * Construct a new instance of float32x4 number.
   * @param {double} value used for x lane.
   * @param {double} value used for y lane.
   * @param {double} value used for z lane.
@@ -40,7 +42,7 @@ function float32x4(x, y, z, w) {
 }
 
 /**
-  * Construct a new instance of a float32x4 number with 0.0 in all lanes.
+  * Construct a new instance of float32x4 number with 0.0 in all lanes.
   * @constructor
   */
 float32x4.zero = function() {
@@ -48,7 +50,7 @@ float32x4.zero = function() {
 }
 
 /**
-  * Construct a new instance of a float32x4 number with the same value
+  * Construct a new instance of float32x4 number with the same value
   * in all lanes.
   * @param {double} value used for all lanes.
   * @constructor
@@ -87,18 +89,18 @@ Object.defineProperty(float32x4.prototype, 'signMask', {
 });
 
 /**
-  * Construct a new instance of a uint32x4 number.
+  * Construct a new instance of int32x4 number.
   * @param {integer} 32-bit unsigned value used for x lane.
   * @param {integer} 32-bit unsigned value used for y lane.
   * @param {integer} 32-bit unsigned value used for z lane.
   * @param {integer} 32-bit unsigned value used for w lane.
   * @constructor
   */
-function uint32x4(x, y, z, w) {
-  if (!(this instanceof uint32x4)) {
-    return new uint32x4(x, y, z, w);
+function int32x4(x, y, z, w) {
+  if (!(this instanceof int32x4)) {
+    return new int32x4(x, y, z, w);
   }
-  this.storage_ = new Uint32Array(4);
+  this.storage_ = new Int32Array(4);
   this.storage_[0] = x;
   this.storage_[1] = y;
   this.storage_[2] = z;
@@ -106,7 +108,7 @@ function uint32x4(x, y, z, w) {
 }
 
 /**
-  * Construct a new instance of a uint32x4 number with 0xFFFFFFFF or 0x0 in each
+  * Construct a new instance of int32x4 number with -1 (all bits set) or 0x0 in each
   * lane, depending on the truth value in x, y, z, and w.
   * @param {boolean} flag used for x lane.
   * @param {boolean} flag used for y lane.
@@ -114,59 +116,59 @@ function uint32x4(x, y, z, w) {
   * @param {boolean} flag used for w lane.
   * @constructor
   */
-uint32x4.bool = function(x, y, z, w) {
-  return uint32x4(x ? 0xFFFFFFFF : 0x0,
-                  y ? 0xFFFFFFFF : 0x0,
-                  z ? 0xFFFFFFFF : 0x0,
-                  w ? 0xFFFFFFFF : 0x0);
+int32x4.bool = function(x, y, z, w) {
+  return int32x4(x ? -1 : 0x0,
+                  y ? -1 : 0x0,
+                  z ? -1 : 0x0,
+                  w ? -1 : 0x0);
 }
 
 /**
-  * Construct a new instance of a uint32x4 number with the same value
+  * Construct a new instance of int32x4 number with the same value
   * in all lanes.
   * @param {integer} value used for all lanes.
   * @constructor
   */
-uint32x4.splat = function(s) {
-  return uint32x4(s, s, s, s);
+int32x4.splat = function(s) {
+  return int32x4(s, s, s, s);
 }
 
-Object.defineProperty(uint32x4.prototype, 'x', {
+Object.defineProperty(int32x4.prototype, 'x', {
   get: function() { return this.storage_[0]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'y', {
+Object.defineProperty(int32x4.prototype, 'y', {
   get: function() { return this.storage_[1]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'z', {
+Object.defineProperty(int32x4.prototype, 'z', {
   get: function() { return this.storage_[2]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'w',
+Object.defineProperty(int32x4.prototype, 'w',
   { get: function() { return this.storage_[3]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagX', {
+Object.defineProperty(int32x4.prototype, 'flagX', {
   get: function() { return this.storage_[0] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagY', {
+Object.defineProperty(int32x4.prototype, 'flagY', {
   get: function() { return this.storage_[1] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagZ', {
+Object.defineProperty(int32x4.prototype, 'flagZ', {
   get: function() { return this.storage_[2] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagW',
+Object.defineProperty(int32x4.prototype, 'flagW',
   { get: function() { return this.storage_[3] != 0x0; }
 });
 
 /**
   * Extract the sign bit from each lane return them in the first 4 bits.
   */
-Object.defineProperty(uint32x4.prototype, 'signMask', {
+Object.defineProperty(int32x4.prototype, 'signMask', {
   get: function() {
     var mx = (this.storage_[0] & 0x80000000) >>> 31;
     var my = (this.storage_[1] & 0x80000000) >>> 31;
@@ -287,414 +289,580 @@ Float32x4Array.prototype.setAt = function(i, v) {
   this.storage_[i*4+3] = v.w;
 }
 
+
+function Int32x4Array(a, b, c) {
+  // a: element count, an Int32x4Array to copy, or an ArrayBuffer to view (b = byteOffset, c = length in elements).
+  function isNumber(o) {
+      return typeof o == "number" || (typeof o == "object" && o !== null && o.constructor === Number);
+  }
+
+  function isTypedArray(o) {
+    return (o instanceof Int8Array) ||
+           (o instanceof Uint8Array) ||
+           (o instanceof Uint8ClampedArray) ||
+           (o instanceof Int16Array) ||
+           (o instanceof Uint16Array) ||
+           (o instanceof Int32Array) ||
+           (o instanceof Uint32Array) ||
+           (o instanceof Float32Array) ||
+           (o instanceof Float64Array) ||
+           (o instanceof Int32x4Array) ||
+           (o instanceof Float32x4Array);
+  }
+
+  function isArrayBuffer(o) {
+    return (o instanceof ArrayBuffer);
+  }
+
+  if (isNumber(a)) {
+    this.storage_ = new Int32Array(a*4);
+    this.length_ = a;
+    this.byteOffset_ = 0;
+    return;
+  } else if (isTypedArray(a)) {
+    if (!(a instanceof Int32x4Array)) {
+      throw "Copying typed array of non-Int32x4Array is unimplemented.";
+    }
+    this.storage_ = new Int32Array(a.length * 4);
+    this.length_ = a.length;
+    this.byteOffset_ = 0;
+    // Copy the int32 lanes (four per element).
+    for (var i = 0; i < a.length*4; i++) {
+      this.storage_[i] = a.storage_[i];
+    }
+  } else if (isArrayBuffer(a)) {
+    if ((b != undefined) && (b % Int32x4Array.BYTES_PER_ELEMENT) != 0) {
+      throw "byteOffset must be a multiple of 16.";
+    }
+    if (c != undefined) {
+      c *= 4;
+      this.storage_ = new Int32Array(a, b, c);
+    }
+    else {
+      // Note: new Int32Array(a, b) is NOT equivalent to new Int32Array(a, b, undefined)
+      this.storage_ = new Int32Array(a, b);
+    }
+    this.length_ = this.storage_.length / 4;
+    this.byteOffset_ = b != undefined ? b : 0;
+  } else {
+    throw "Unknown type of first argument.";
+  }
+}
+
+Object.defineProperty(Int32x4Array.prototype, 'length',  // read-only: number of int32x4 elements (length_)
+  { get: function() { return this.length_; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'byteLength',  // read-only: length_ * 16 bytes
+  { get: function() { return this.length_ * Int32x4Array.BYTES_PER_ELEMENT; }
+});
+
+Object.defineProperty(Int32x4Array, 'BYTES_PER_ELEMENT',  // on the constructor: always 16
+  { get: function() { return 16; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'BYTES_PER_ELEMENT',  // on instances: always 16
+  { get: function() { return 16; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'byteOffset',  // read-only: byteOffset_ recorded by the constructor
+  { get: function() { return this.byteOffset_; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'buffer',  // read-only: ArrayBuffer behind the Int32Array storage
+  { get: function() { return this.storage_.buffer; }
+});
+
+// Reads element i and returns its four lanes as a new int32x4, the same
+// type setAt accepts. Throws a string if i is negative or >= length.
+Int32x4Array.prototype.getAt = function(i) {
+  if (i < 0) {
+    throw "Index must be >= 0.";
+  }
+  if (i >= this.length) {
+    throw "Index out of bounds.";
+  }
+  var base = i * 4; // four int32 lanes per element
+  var s = this.storage_;
+  return new int32x4(s[base], s[base + 1], s[base + 2], s[base + 3]);
+}
+
+// Writes the x, y, z, w lanes of the int32x4 value v into element i.
+Int32x4Array.prototype.setAt = function(i, v) {
+  if (i < 0) {
+    throw "Index must be >= 0.";
+  }
+  if (i >= this.length) {
+    throw "Index out of bounds.";
+  }
+  if (!(v instanceof int32x4)) {
+    throw "Value is not a int32x4.";
+  }
+  var base = i * 4; // first lane of element i in the flat storage
+  this.storage_[base] = v.x; this.storage_[base + 1] = v.y;
+  this.storage_[base + 2] = v.z; this.storage_[base + 3] = v.w;
+}
+
 var SIMD = (function () {
   return {
-    /**
-      * @return {float32x4} New instance of float32x4 with absolute values of
-      * t.
-      */
-    abs: function(t) {
-      return new float32x4(Math.abs(t.x), Math.abs(t.y), Math.abs(t.z),
-                           Math.abs(t.w));
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with negated values of
-      * t.
-      */
-    neg: function(t) {
-      return new float32x4(-t.x, -t.y, -t.z, -t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a + b.
-      */
-    add: function(a, b) {
-      return new float32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a - b.
-      */
-    sub: function(a, b) {
-      return new float32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a * b.
-      */
-    mul: function(a, b) {
-      return new float32x4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a / b.
-      */
-    div: function(a, b) {
-      return new float32x4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with t's values clamped
-      * between lowerLimit and upperLimit.
-      */
-    clamp: function(t, lowerLimit, upperLimit) {
-      var cx = t.x < lowerLimit.x ? lowerLimit.x : t.x;
-      var cy = t.y < lowerLimit.y ? lowerLimit.y : t.y;
-      var cz = t.z < lowerLimit.z ? lowerLimit.z : t.z;
-      var cw = t.w < lowerLimit.w ? lowerLimit.w : t.w;
-      cx = cx > upperLimit.x ? upperLimit.x : cx;
-      cy = cy > upperLimit.y ? upperLimit.y : cy;
-      cz = cz > upperLimit.z ? upperLimit.z : cz;
-      cw = cw > upperLimit.w ? upperLimit.w : cw;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with the minimum value of
-      * t and other.
-      */
-    min: function(t, other) {
-      var cx = t.x > other.x ? other.x : t.x;
-      var cy = t.y > other.y ? other.y : t.y;
-      var cz = t.z > other.z ? other.z : t.z;
-      var cw = t.w > other.w ? other.w : t.w;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with the maximum value of
-      * t and other.
-      */
-    max: function(t, other) {
-      var cx = t.x < other.x ? other.x : t.x;
-      var cy = t.y < other.y ? other.y : t.y;
-      var cz = t.z < other.z ? other.z : t.z;
-      var cw = t.w < other.w ? other.w : t.w;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with reciprocal value of
-      * t.
-      */
-    reciprocal: function(t) {
-      return new float32x4(1.0 / t.x, 1.0 / t.y, 1.0 / t.z, 1.0 / t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with square root of the
-      * reciprocal value of t.
-      */
-    reciprocalSqrt: function(t) {
-      return new float32x4(Math.sqrt(1.0 / t.x), Math.sqrt(1.0 / t.y),
-                           Math.sqrt(1.0 / t.z), Math.sqrt(1.0 / t.w));
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with values of t
-      * scaled by s.
-      */
-    scale: function(t, s) {
-      return new float32x4(s * t.x, s * t.y, s * t.z, s * t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with square root of
-      * values of t.
-      */
-    sqrt: function(t) {
-      return new float32x4(Math.sqrt(t.x), Math.sqrt(t.y),
-                           Math.sqrt(t.z), Math.sqrt(t.w));
-    },
-    /**
-      * @param {float32x4} t An instance of float32x4 to be shuffled.
-      * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
-      * @return {float32x4} New instance of float32x4 with lanes shuffled.
-      */
-    shuffle: function(t, mask) {
-      var _x = (mask) & 0x3;
-      var _y = (mask >> 2) & 0x3;
-      var _z = (mask >> 4) & 0x3;
-      var _w = (mask >> 6) & 0x3;
-      return new float32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
-                           t.storage_[_w]);
+    float32x4: {
+        /**
+        * @return {float32x4} New instance of float32x4 with absolute values of
+        * t.
+        */
+      abs: function(t) {
+        return new float32x4(Math.abs(t.x), Math.abs(t.y), Math.abs(t.z),
+                             Math.abs(t.w));
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with negated values of
+        * t.
+        */
+      neg: function(t) {
+        return new float32x4(-t.x, -t.y, -t.z, -t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a + b.
+        */
+      add: function(a, b) {
+        return new float32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a - b.
+        */
+      sub: function(a, b) {
+        return new float32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a * b.
+        */
+      mul: function(a, b) {
+        return new float32x4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a / b.
+        */
+      div: function(a, b) {
+        return new float32x4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with t's values clamped
+        * between lowerLimit and upperLimit.
+        */
+      clamp: function(t, lowerLimit, upperLimit) {
+        var cx = t.x < lowerLimit.x ? lowerLimit.x : t.x;
+        var cy = t.y < lowerLimit.y ? lowerLimit.y : t.y;
+        var cz = t.z < lowerLimit.z ? lowerLimit.z : t.z;
+        var cw = t.w < lowerLimit.w ? lowerLimit.w : t.w;
+        cx = cx > upperLimit.x ? upperLimit.x : cx;
+        cy = cy > upperLimit.y ? upperLimit.y : cy;
+        cz = cz > upperLimit.z ? upperLimit.z : cz;
+        cw = cw > upperLimit.w ? upperLimit.w : cw;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with the minimum value of
+        * t and other.
+        */
+      min: function(t, other) {
+        var cx = t.x > other.x ? other.x : t.x;
+        var cy = t.y > other.y ? other.y : t.y;
+        var cz = t.z > other.z ? other.z : t.z;
+        var cw = t.w > other.w ? other.w : t.w;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with the maximum value of
+        * t and other.
+        */
+      max: function(t, other) {
+        var cx = t.x < other.x ? other.x : t.x;
+        var cy = t.y < other.y ? other.y : t.y;
+        var cz = t.z < other.z ? other.z : t.z;
+        var cw = t.w < other.w ? other.w : t.w;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with reciprocal value of
+        * t.
+        */
+      reciprocal: function(t) {
+        return new float32x4(1.0 / t.x, 1.0 / t.y, 1.0 / t.z, 1.0 / t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with square root of the
+        * reciprocal value of t.
+        */
+      reciprocalSqrt: function(t) {
+        return new float32x4(Math.sqrt(1.0 / t.x), Math.sqrt(1.0 / t.y),
+                             Math.sqrt(1.0 / t.z), Math.sqrt(1.0 / t.w));
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with values of t
+        * scaled by s.
+        */
+      scale: function(t, s) {
+        return new float32x4(s * t.x, s * t.y, s * t.z, s * t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with square root of
+        * values of t.
+        */
+      sqrt: function(t) {
+        return new float32x4(Math.sqrt(t.x), Math.sqrt(t.y),
+                             Math.sqrt(t.z), Math.sqrt(t.w));
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4 to be shuffled.
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {float32x4} New instance of float32x4 with lanes shuffled.
+        */
+      shuffle: function(t, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new float32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
+                             t.storage_[_w]);
+      },
+      /**
+        * @param {float32x4} t1 An instance of float32x4 to be shuffled. XY lanes in result
+        * @param {float32x4} t2 An instance of float32x4 to be shuffled. ZW lanes in result
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {float32x4} New instance of float32x4 with lanes shuffled.
+        */
+      shuffleMix: function(t1, t2, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new float32x4(t1.storage_[_x], t1.storage_[_y], t2.storage_[_z],
+                             t2.storage_[_w]);
+      },
+      /**
+        * @param {double} value used for x lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * x replaced with {x}.
+        */
+      withX: function(t, x) {
+        return new float32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {double} value used for y lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * y replaced with {y}.
+        */
+      withY: function(t, y) {
+        return new float32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {double} value used for z lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * z replaced with {z}.
+        */
+      withZ: function(t, z) {
+        return new float32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {double} value used for w lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * w replaced with {w}.
+        */
+      withW: function(t, w) {
+        return new float32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t < other.
+        */
+      lessThan: function(t, other) {
+        var cx = t.x < other.x;
+        var cy = t.y < other.y;
+        var cz = t.z < other.z;
+        var cw = t.w < other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t <= other.
+        */
+      lessThanOrEqual: function(t, other) {
+        var cx = t.x <= other.x;
+        var cy = t.y <= other.y;
+        var cz = t.z <= other.z;
+        var cw = t.w <= other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t == other.
+        */
+      equal: function(t, other) {
+        var cx = t.x == other.x;
+        var cy = t.y == other.y;
+        var cz = t.z == other.z;
+        var cw = t.w == other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t != other.
+        */
+      notEqual: function(t, other) {
+        var cx = t.x != other.x;
+        var cy = t.y != other.y;
+        var cz = t.z != other.z;
+        var cw = t.w != other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t >= other.
+        */
+      greaterThanOrEqual: function(t, other) {
+        var cx = t.x >= other.x;
+        var cy = t.y >= other.y;
+        var cz = t.z >= other.z;
+        var cw = t.w >= other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t > other.
+        */
+      greaterThan: function(t, other) {
+        var cx = t.x > other.x;
+        var cy = t.y > other.y;
+        var cz = t.z > other.z;
+        var cw = t.w > other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @return {int32x4} a bit-wise copy of t as a int32x4.
+        */
+      bitsToInt32x4: function(t) {
+        var alias = new Int32Array(t.storage_.buffer);
+        return new int32x4(alias[0], alias[1], alias[2], alias[3]);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @return {int32x4} with a float to integer conversion of t.
+        */
+      toInt32x4: function(t) {
+        var a = new int32x4(t.storage_[0], t.storage_[1], t.storage_[2],
+                             t.storage_[3]);
+        return a;
+      }
     },
-    /**
-      * @param {double} value used for x lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * x replaced with {x}.
-      */
-    withX: function(t, x) {
-      return new float32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * @param {double} value used for y lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * y replaced with {y}.
-      */
-    withY: function(t, y) {
-      return new float32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {double} value used for z lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * z replaced with {z}.
-      */
-    withZ: function(t, z) {
-      return new float32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {double} value used for w lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * w replaced with {w}.
-      */
-    withW: function(t, w) {
-      return new float32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t < other.
-      */
-    lessThan: function(t, other) {
-      var cx = t.x < other.x;
-      var cy = t.y < other.y;
-      var cz = t.z < other.z;
-      var cw = t.w < other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t <= other.
-      */
-    lessThanOrEqual: function(t, other) {
-      var cx = t.x <= other.x;
-      var cy = t.y <= other.y;
-      var cz = t.z <= other.z;
-      var cw = t.w <= other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t == other.
-      */
-    equal: function(t, other) {
-      var cx = t.x == other.x;
-      var cy = t.y == other.y;
-      var cz = t.z == other.z;
-      var cw = t.w == other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t != other.
-      */
-    notEqual: function(t, other) {
-      var cx = t.x != other.x;
-      var cy = t.y != other.y;
-      var cz = t.z != other.z;
-      var cw = t.w != other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t >= other.
-      */
-    greaterThanOrEqual: function(t, other) {
-      var cx = t.x >= other.x;
-      var cy = t.y >= other.y;
-      var cz = t.z >= other.z;
-      var cw = t.w >= other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t > other.
-      */
-    greaterThan: function(t, other) {
-      var cx = t.x > other.x;
-      var cy = t.y > other.y;
-      var cz = t.z > other.z;
-      var cw = t.w > other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a & b.
-      */
-    and: function(a, b) {
-      return new uint32x4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a | b.
-      */
-    or: function(a, b) {
-      return new uint32x4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a ^ b.
-      */
-    xor: function(a, b) {
-      return new uint32x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of ~a
-      */
-    negu32: function(t) {
-      return new uint32x4(~t.x, ~t.y, ~t.z, ~t.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a + b.
-      */
-    addu32: function(a, b) {
-      return new uint32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a - b.
-      */
-    subu32: function(a, b) {
-      return new uint32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a * b.
-      */
-    mulu32: function(a, b) {
-      return new uint32x4(Math.imul(a.x, b.x), Math.imul(a.y, b.y),
-                          Math.imul(a.z, b.z), Math.imul(a.w, b.w));
-    },
-    /**
-      * @param {float32x4}
-      */
-    select: function(t, trueValue, falseValue) {
-      var tv = SIMD.float32x4BitsToUint32x4(trueValue);
-      var fv = SIMD.float32x4BitsToUint32x4(falseValue);
-      var tr = SIMD.and(t, tv);
-      var fr = SIMD.and(SIMD.negu32(t), fv);
-      return SIMD.uint32x4BitsToFloat32x4(SIMD.or(tr, fr));
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for x lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * x lane replaced with {x}.
-      */
-    withXu32: function(t, x) {
-      return new uint32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for y lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * y lane replaced with {y}.
-      */
-    withYu32: function(t, y) {
-      return new uint32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for z lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * z lane replaced with {z}.
-      */
-    withZu32: function(t, z) {
-      return new uint32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {integer} 32-bit value used for w lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * w lane replaced with {w}.
-      */
-    withWu32: function(t, w) {
-      return new uint32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} x flag used for x lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * x lane replaced with {x}.
-      */
-    withFlagX: function(t, flagX) {
-      var x = flagX ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} y flag used for y lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * y lane replaced with {y}.
-      */
-    withFlagY: function(t, flagY) {
-      var y = flagY ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} z flag used for z lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * z lane replaced with {z}.
-      */
-    withFlagZ: function(t, flagZ) {
-      var z = flagZ ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} w flag used for w lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * w lane replaced with {w}.
-      */
-    withFlagW: function(t, flagW) {
-      var w = flagW ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @return {uint32x4} a bit-wise copy of t as a uint32x4.
-      */
-    float32x4BitsToUint32x4: function(t) {
-      var alias = new Uint32Array(t.storage_.buffer);
-      return new uint32x4(alias[0], alias[1], alias[2], alias[3]);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {float32x4} a bit-wise copy of t as a float32x4.
-      */
-    uint32x4BitsToFloat32x4: function(t) {
-      var alias = new Float32Array(t.storage_.buffer);
-      return new float32x4(alias[0], alias[1], alias[2], alias[3]);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {float32x4} with a float to integer conversion copy of t.
-      */
-    uint32x4ToFloat32x4: function(t) {
-      var a = float32x4.zero();
-      a.storage_[0] = t.storage_[0];
-      a.storage_[1] = t.storage_[1];
-      a.storage_[2] = t.storage_[2];
-      a.storage_[3] = t.storage_[3];
-      return a;
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @return {uint32x4} with a integer to float conversion of t.
-      */
-    float32x4ToUint32x4: function(t) {
-      var a = new uint32x4(t.storage_[0], t.storage_[1], t.storage_[2],
-                           t.storage_[3]);
-      return a;
+    int32x4: {
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a & b.
+        */
+      and: function(a, b) {
+        return new int32x4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a | b.
+        */
+      or: function(a, b) {
+        return new int32x4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a ^ b.
+        */
+      xor: function(a, b) {
+        return new int32x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of ~t
+        */
+      not: function(t) {
+        return new int32x4(~t.x, ~t.y, ~t.z, ~t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of -t
+        */
+      neg: function(t) {
+        return new int32x4(-t.x, -t.y, -t.z, -t.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a + b.
+        */
+      add: function(a, b) {
+        return new int32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a - b.
+        */
+      sub: function(a, b) {
+        return new int32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a * b.
+        */
+      mul: function(a, b) {
+        return new int32x4(Math.imul(a.x, b.x), Math.imul(a.y, b.y),
+                           Math.imul(a.z, b.z), Math.imul(a.w, b.w));
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4 to be shuffled.
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {int32x4} New instance of int32x4 with lanes shuffled.
+        */
+      shuffle: function(t, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new int32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
+                             t.storage_[_w]);
+      },
+      /**
+        * @param {int32x4} t1 An instance of int32x4 to be shuffled. XY lanes in result
+        * @param {int32x4} t2 An instance of int32x4 to be shuffled. ZW lanes in result
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {int32x4} New instance of int32x4 with lanes shuffled.
+        */
+      shuffleMix: function(t1, t2, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new int32x4(t1.storage_[_x], t1.storage_[_y], t2.storage_[_z],
+                             t2.storage_[_w]);
+      },
+      /**
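+        * @param {int32x4} t Bit mask; the float32x4 result takes trueValue's bits where t's bits are set, falseValue's elsewhere.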
+        */
+      select: function(t, trueValue, falseValue) {
+        var tv = SIMD.float32x4.bitsToInt32x4(trueValue);
+        var fv = SIMD.float32x4.bitsToInt32x4(falseValue);
+        var tr = SIMD.int32x4.and(t, tv);
+        var fr = SIMD.int32x4.and(SIMD.int32x4.not(t), fv);
+        return SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.or(tr, fr));
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for x lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * x lane replaced with {x}.
+        */
+      withX: function(t, x) {
+        return new int32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for y lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * y lane replaced with {y}.
+        */
+      withY: function(t, y) {
+        return new int32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for z lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * z lane replaced with {z}.
+        */
+      withZ: function(t, z) {
+        return new int32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {integer} 32-bit value used for w lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * w lane replaced with {w}.
+        */
+      withW: function(t, w) {
+        return new int32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} x flag used for x lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * x lane replaced with {x}.
+        */
+      withFlagX: function(t, flagX) {
+        var x = flagX ? 0xFFFFFFFF : 0x0;
+        return new int32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} y flag used for y lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * y lane replaced with {y}.
+        */
+      withFlagY: function(t, flagY) {
+        var y = flagY ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} z flag used for z lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * z lane replaced with {z}.
+        */
+      withFlagZ: function(t, flagZ) {
+        var z = flagZ ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} w flag used for w lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * w lane replaced with {w}.
+        */
+      withFlagW: function(t, flagW) {
+        var w = flagW ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {float32x4} a bit-wise copy of t as a float32x4.
+        */
+      bitsToFloat32x4: function(t) {
+        var alias = new Float32Array(t.storage_.buffer);
+        return new float32x4(alias[0], alias[1], alias[2], alias[3]);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {float32x4} with an integer to float conversion copy of t.
+        */
+      toFloat32x4: function(t) {
+        var a = float32x4.zero();
+        a.storage_[0] = t.storage_[0];
+        a.storage_[1] = t.storage_[1];
+        a.storage_[2] = t.storage_[2];
+        a.storage_[3] = t.storage_[3];
+        return a;
+      }
     }
   }
 })();
@@ -955,4 +1123,3 @@ Object.defineProperty(SIMD, 'WWWX', { get: function() { return 0x3F; } });
 Object.defineProperty(SIMD, 'WWWY', { get: function() { return 0x7F; } });
 Object.defineProperty(SIMD, 'WWWZ', { get: function() { return 0xBF; } });
 Object.defineProperty(SIMD, 'WWWW', { get: function() { return 0xFF; } });
-
diff --git a/system/include/emscripten/emmintrin.h b/system/include/emscripten/emmintrin.h
new file mode 100644
index 00000000..31265db8
--- /dev/null
+++ b/system/include/emscripten/emmintrin.h
@@ -0,0 +1,94 @@
+/* Subset of the SSE2 <emmintrin.h> integer intrinsics, built on the int32x4 vector type. */
+#ifndef __EMSCRIPTEN_EMMINTRIN_H__
+#define __EMSCRIPTEN_EMMINTRIN_H__
+
+#include <xmmintrin.h>
+
+typedef int32x4 __m128i;
+
+/* Arguments are given highest lane first: w is stored in lane 0, z in lane 3. */
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_set_epi32(int z, int y, int x, int w)
+{
+  return (__m128i){ w, x, y, z };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_set1_epi32(int w)
+{
+  return (__m128i){ w, w, w, w };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_si128(__m128i *p, __m128i a)
+{
+  *p = a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_and_si128(__m128i a, __m128i b)
+{
+  return a & b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_andnot_si128(__m128i a, __m128i b)
+{
+  return ~a & b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_or_si128(__m128i a, __m128i b)
+{
+  return a | b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_xor_si128(__m128i a, __m128i b)
+{
+  return a ^ b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_add_epi32(__m128i a, __m128i b)
+{
+  return a + b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_sub_epi32(__m128i a, __m128i b)
+{
+  return a - b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_castsi128_ps(__m128i a)
+{
+  return emscripten_int32x4_bitsToFloat32x4(a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cvtepi32_ps(__m128i a)
+{
+  return emscripten_int32x4_toFloat32x4(a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_castps_si128(__m128 a)
+{
+  return emscripten_float32x4_bitsToInt32x4(a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_cvtps_epi32(__m128 a)
+{
+  return emscripten_float32x4_toInt32x4(a);
+}
+
+#endif /* __EMSCRIPTEN_EMMINTRIN_H__ */
diff --git a/system/include/emscripten/vector.h b/system/include/emscripten/vector.h
index 938f2369..cf26a5d6 100644
--- a/system/include/emscripten/vector.h
+++ b/system/include/emscripten/vector.h
@@ -2,7 +2,7 @@
 // Support for the JS SIMD API proposal, https://github.com/johnmccutchan/ecmascript_simd
 
 typedef float float32x4 __attribute__((__vector_size__(16)));
-typedef unsigned int uint32x4 __attribute__((__vector_size__(16)));
+typedef unsigned int int32x4 __attribute__((__vector_size__(16)));
 
 #ifdef __cplusplus
 extern "C" {
@@ -10,6 +10,24 @@ extern "C" {
 
 unsigned int emscripten_float32x4_signmask(float32x4 x);
 
+float32x4 emscripten_float32x4_min(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_max(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_sqrt(float32x4 a);
+float32x4 emscripten_float32x4_lessThan(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_lessThanOrEqual(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_equal(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_greaterThanOrEqual(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_greaterThan(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_and(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_andNot(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_or(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_xor(float32x4 a, float32x4 b);
+
+float32x4 emscripten_int32x4_bitsToFloat32x4(int32x4 a);
+float32x4 emscripten_int32x4_toFloat32x4(int32x4 a);
+int32x4 emscripten_float32x4_bitsToInt32x4(float32x4 a);
+int32x4 emscripten_float32x4_toInt32x4(float32x4 a);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/system/include/emscripten/xmmintrin.h b/system/include/emscripten/xmmintrin.h
new file mode 100644
index 00000000..1b9108fa
--- /dev/null
+++ b/system/include/emscripten/xmmintrin.h
@@ -0,0 +1,138 @@
+/* Subset of the SSE <xmmintrin.h> float intrinsics, built on the float32x4 vector type. */
+#ifndef __EMSCRIPTEN_XMMINTRIN_H__
+#define __EMSCRIPTEN_XMMINTRIN_H__
+
+#include <vector.h>
+
+typedef float32x4 __m128;
+
+/* Arguments are given highest lane first: w is stored in lane 0, z in lane 3. */
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_set_ps(float z, float y, float x, float w)
+{
+  return (__m128){ w, x, y, z };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_set1_ps(float w)
+{
+  return (__m128){ w, w, w, w };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_ps(float *p, __m128 a)
+{
+  *(__m128 *)p = a;
+}
+
+static __inline__ int __attribute__((__always_inline__))
+_mm_movemask_ps(__m128 a)
+{
+  return emscripten_float32x4_signmask(a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_add_ps(__m128 a, __m128 b)
+{
+  return a + b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_sub_ps(__m128 a, __m128 b)
+{
+  return a - b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_mul_ps(__m128 a, __m128 b)
+{
+  return a * b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_div_ps(__m128 a, __m128 b)
+{
+  return a / b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_min_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_min(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_max_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_max(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_sqrt_ps(__m128 a)
+{
+  return emscripten_float32x4_sqrt(a);
+}
+
+/* TODO: shuffles */
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmplt_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_lessThan(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmple_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_lessThanOrEqual(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpeq_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_equal(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpge_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_greaterThanOrEqual(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpgt_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_greaterThan(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_and_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_and(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_andnot_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_andNot(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_or_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_or(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_xor_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_xor(a, b);
+}
+
+#endif /* __EMSCRIPTEN_XMMINTRIN_H__ */
diff --git a/tests/test_core.py b/tests/test_core.py
index 67e316e4..ec00c0a5 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -3865,6 +3865,25 @@ Exiting setjmp function, level: 0, prev_jmp: -1
     process.communicate()
     assert process.returncode is 0, 'float.h should agree with our system'
 
+  def test_llvm_used(self):
+    src = r'''
+  #include <stdio.h>
+  #include <emscripten.h>
+  
+  extern "C" {
+    EMSCRIPTEN_KEEPALIVE void foobar(int x) {
+      printf("Worked! %d\n", x);
+    }
+  }
+
+  int main() {
+    emscripten_run_script("Module['_foobar'](10)");
+    return 0;
+  }'''
+    
+    Building.LLVM_OPTS = 3
+    self.do_run(src, 'Worked! 10\n')
+
   def test_emscripten_api(self):
       #if Settings.MICRO_OPTS or Settings.RELOOP or Building.LLVM_OPTS: return self.skip('FIXME')
 
@@ -8792,20 +8811,20 @@ int main(int argc, char **argv) {
     printf("zeros %d, %d, %d, %d\n", (int)c[0], (int)c[1], (int)c[2], (int)c[3]);
   }
   {
-    uint32x4 *a = (uint32x4*)&data[0];
-    uint32x4 *b = (uint32x4*)&data[4];
-    uint32x4 c, d, e, f;
+    int32x4 *a = (int32x4*)&data[0];
+    int32x4 *b = (int32x4*)&data[4];
+    int32x4 c, d, e, f;
     c = *a;
     d = *b;
-    printf("4uints! %d, %d, %d, %d   %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], d[0], d[1], d[2], d[3]);
+    printf("4ints! %d, %d, %d, %d   %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], d[0], d[1], d[2], d[3]);
     e = c+d;
     f = c-d;
-    printf("5uints! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
+    printf("5ints! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
     e = c&d;
     f = c|d;
     e = ~c&d;
     f = c^d;
-    printf("5uintops! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
+    printf("5intops! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
   }
   {
     float32x4 c, d, e, f;
@@ -8823,9 +8842,9 @@ int main(int argc, char **argv) {
 2floats! 48, 68, 92, 120   42, 56, 72, 90
 3floats! 48, 68, 92, 120   2016, 3808, 6624, 10800
 zeros 0, 0, 0, 0
-4uints! 1086324736, 1094713344, 1101004800, 1106247680   1109917696, 1113587712, 1116733440, 1119092736
-5uints! -2098724864, -2086666240, -2077229056, -2069626880   -23592960, -18874368, -15728640, -12845056
-5uintops! 36175872, 35651584, 34603008, 33816576   48758784, 52428800, 53477376, 54788096
+4ints! 1086324736, 1094713344, 1101004800, 1106247680   1109917696, 1113587712, 1116733440, 1119092736
+5ints! -2098724864, -2086666240, -2077229056, -2069626880   -23592960, -18874368, -15728640, -12845056
+5intops! 36175872, 35651584, 34603008, 33816576   48758784, 52428800, 53477376, 54788096
 6floats! -9, 0, 4, 9   -2, -12, 14, 10
 ''')
 
@@ -8876,6 +8895,484 @@ zeros 0, 0, 0, 0
 16.000000
 ''')
 
+  def test_simd3(self):
+    if Settings.USE_TYPED_ARRAYS != 2: return self.skip('needs ta2')
+    if Settings.ASM_JS: Settings.ASM_JS = 2 # does not validate
+    src = r'''
+    #include <iostream>
+    #include <emmintrin.h>
+    #include <assert.h>
+    #include <stdint.h>
+    #include <bitset>
+
+    using namespace std;
+
+    void testSetPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 4.0);
+        assert(ar[1] == 3.0);
+        assert(ar[2] == 2.0);
+        assert(ar[3] == 1.0);
+    }
+
+    void testSet1Ps() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_set1_ps(5.5);
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 5.5);
+        assert(ar[1] == 5.5);
+        assert(ar[2] == 5.5);
+        assert(ar[3] == 5.5);
+    }
+
+    void testSetZeroPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_setzero_ps();
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testSetEpi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_set_epi32(5, 7, 126, 381);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 381);
+        assert(ar[1] == 126);
+        assert(ar[2] == 7);
+        assert(ar[3] == 5);
+        v = _mm_set_epi32(0x55555555, 0xaaaaaaaa, 0xffffffff, 0x12345678);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x12345678);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0xaaaaaaaa);
+        assert(ar[3] == 0x55555555);
+    }
+
+    void testSet1Epi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_set1_epi32(-5);
+        _mm_store_si128((__m128i *)ar, v);    
+        assert(ar[0] == -5);
+        assert(ar[1] == -5);
+        assert(ar[2] == -5);
+        assert(ar[3] == -5);
+    }
+
+    void testSetZeroSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_setzero_si128();
+        _mm_store_si128((__m128i *)ar, v);    
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testBitCasts() {
+        int32_t __attribute__((__aligned__(16))) ar1[4];
+        float __attribute__((__aligned__(16))) ar2[4];
+        __m128i v1 = _mm_set_epi32(0x3f800000, 0x40000000, 0x40400000, 0x40800000);
+        __m128 v2 = _mm_castsi128_ps(v1);
+        _mm_store_ps(ar2, v2);
+        assert(ar2[0] == 4.0);
+        assert(ar2[1] == 3.0);
+        assert(ar2[2] == 2.0);
+        assert(ar2[3] == 1.0);
+        v2 = _mm_set_ps(5.0, 6.0, 7.0, 8.0);
+        v1 = _mm_castps_si128(v2);
+        _mm_store_si128((__m128i *)ar1, v1);
+        assert(ar1[0] == 0x41000000);
+        assert(ar1[1] == 0x40e00000);
+        assert(ar1[2] == 0x40c00000);
+        assert(ar1[3] == 0x40a00000);
+        float w = 0;
+        float z = -278.3;
+        float y = 5.2;
+        float x = -987654321; 
+        v1 = _mm_castps_si128(_mm_set_ps(w, z, y, x));
+        _mm_store_ps(ar2, _mm_castsi128_ps(v1));
+        assert(ar2[0] == x);
+        assert(ar2[1] == y);
+        assert(ar2[2] == z);
+        assert(ar2[3] == w);
+        /*
+        std::bitset<sizeof(float)*CHAR_BIT> bits1x(*reinterpret_cast<unsigned long*>(&(ar2[0])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1y(*reinterpret_cast<unsigned long*>(&(ar2[1])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1z(*reinterpret_cast<unsigned long*>(&(ar2[2])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1w(*reinterpret_cast<unsigned long*>(&(ar2[3])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2x(*reinterpret_cast<unsigned long*>(&x));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2y(*reinterpret_cast<unsigned long*>(&y));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2z(*reinterpret_cast<unsigned long*>(&z));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2w(*reinterpret_cast<unsigned long*>(&w));
+        assert(bits1x == bits2x);
+        assert(bits1y == bits2y);
+        assert(bits1z == bits2z);
+        assert(bits1w == bits2w);
+        */
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0, 0x5555cccc, 0xaaaaaaaa));
+        _mm_store_si128((__m128i *)ar1, _mm_castps_si128(v2));
+        assert(ar1[0] == 0xaaaaaaaa);
+        assert(ar1[1] == 0x5555cccc);
+        assert(ar1[2] == 0);
+        assert(ar1[3] == 0xffffffff);
+    }
+
+    void testConversions() {
+        int32_t __attribute__((__aligned__(16))) ar1[4];
+        float __attribute__((__aligned__(16))) ar2[4];
+        __m128i v1 = _mm_set_epi32(0, -3, -517, 256);
+        __m128 v2 = _mm_cvtepi32_ps(v1);
+        _mm_store_ps(ar2, v2);
+        assert(ar2[0] == 256.0);
+        assert(ar2[1] == -517.0);
+        assert(ar2[2] == -3.0);
+        assert(ar2[3] == 0);
+        v2 = _mm_set_ps(5.0, 6.0, 7.45, -8.0);
+        v1 = _mm_cvtps_epi32(v2);
+        _mm_store_si128((__m128i *)ar1, v1);
+        assert(ar1[0] == -8);
+        assert(ar1[1] == 7);
+        assert(ar1[2] == 6);
+        assert(ar1[3] == 5);
+    }
+
+    void testMoveMaskPs() {
+        __m128 v = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff));
+        int mask = _mm_movemask_ps(v);
+        assert(mask == 13);
+    }
+
+    void testAddPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_add_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 41.0);
+        assert(ar[1] == 32.0);
+        assert(ar[2] == 23.0);
+        assert(ar[3] == 14.0);
+    }
+
+    void testSubPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_sub_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == -39.0);
+        assert(ar[1] == -28.0);
+        assert(ar[2] == -17.0);
+        assert(ar[3] == -6.0);
+    }
+
+    void testMulPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_mul_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 40.0);
+        assert(ar[1] == 60.0);
+        assert(ar[2] == 60.0);
+        assert(ar[3] == 40.0);
+    }
+
+    void testDivPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 9.0, 8.0, 1.0);
+        __m128 v2 = _mm_set_ps(2.0, 3.0, 1.0, 0.5);
+        __m128 v = _mm_div_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 2.0);
+        assert(ar[1] == 8.0);
+        assert(ar[2] == 3.0);
+        assert(ar[3] == 2.0);
+    }
+
+    void testMinPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(-20.0, 10.0, 30.0, 0.5);
+        __m128 v2 = _mm_set_ps(2.0, 1.0, 50.0, 0.0);
+        __m128 v = _mm_min_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 0.0);
+        assert(ar[1] == 30.0);
+        assert(ar[2] == 1.0);
+        assert(ar[3] == -20.0);
+    }
+
+    void testMaxPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(-20.0, 10.0, 30.0, 0.5);
+        __m128 v2 = _mm_set_ps(2.5, 5.0, 55.0, 1.0);
+        __m128 v = _mm_max_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 1.0);
+        assert(ar[1] == 55.0);
+        assert(ar[2] == 10.0);
+        assert(ar[3] == 2.5);
+    }
+
+    void testSqrtPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(16.0, 9.0, 4.0, 1.0);
+        __m128 v = _mm_sqrt_ps(v1);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 1.0);
+        assert(ar[1] == 2.0);
+        assert(ar[2] == 3.0);
+        assert(ar[3] == 4.0);
+    }
+
+    void testCmpLtPs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001);
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmplt_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0xffffffff);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0xffffffff);
+        assert(_mm_movemask_ps(v) == 9);
+    }
+
+    void testCmpLePs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001);
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmple_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0xffffffff);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0xffffffff);
+        assert(_mm_movemask_ps(v) == 13);
+    }
+
+    void testCmpEqPs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001);
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpeq_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0);
+        assert(_mm_movemask_ps(v) == 4);
+    }
+
+    void testCmpGePs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001);
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpge_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0);
+        assert(_mm_movemask_ps(v) == 6);
+    }
+
+    void testCmpGtPs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001);
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpgt_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+        assert(_mm_movemask_ps(v) == 2);
+    }
+
+    void testAndPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(425, -501, -32, 68);
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff));
+        __m128 v = _mm_and_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 68);
+        assert(ar[1] == 0);
+        assert(ar[2] == -501);
+        assert(ar[3] == 425);
+        int32_t __attribute__((__aligned__(16))) ar2[4];
+        v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa));
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555));
+        v = _mm_and_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar2, _mm_castps_si128(v));
+        assert(ar2[0] == 0);
+        assert(ar2[1] == 0);
+        assert(ar2[2] == 0);
+        assert(ar2[3] == 0);
+    }
+
+    void testAndNotPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(425, -501, -32, 68);
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff));
+        __m128 v = _mm_andnot_ps(v2, v1);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 0);
+        assert(ar[1] == -32);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+        int32_t __attribute__((__aligned__(16))) ar2[4];
+        v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa));
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555));
+        v = _mm_andnot_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar2, _mm_castps_si128(v));
+        assert(ar2[0] == 0x55555555);
+        assert(ar2[1] == 0x55555555);
+        assert(ar2[2] == 0x55555555);
+        assert(ar2[3] == 0x55555555);
+    }
+
+    void testOrPs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0));
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555));
+        __m128 v = _mm_or_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0x55555555);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0xffffffff);
+    }
+
+    void testXorPs() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0));
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555));
+        __m128 v = _mm_xor_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0x55555555);
+        assert(ar[1] == 0xaaaaaaaa);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0xffffffff);
+    }
+
+    void testAndSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa);
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555);
+        __m128i v = _mm_and_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testAndNotSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa);
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555);
+        __m128i v = _mm_andnot_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555);
+        assert(ar[1] == 0x55555555);
+        assert(ar[2] == 0x55555555);
+        assert(ar[3] == 0x55555555);
+    }
+
+    void testOrSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0);
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555);
+        __m128i v = _mm_or_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0xffffffff);
+    }
+
+    void testXorSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0);
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555);
+        __m128i v = _mm_xor_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555);
+        assert(ar[1] == 0xaaaaaaaa);
+        assert(ar[2] == 0xffffffff);
+        assert(ar[3] == 0xffffffff);
+    }
+
+    void testAddEpi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(4, 3, 2, 1);
+        __m128i v2 = _mm_set_epi32(10, 20, 30, 40);
+        __m128i v = _mm_add_epi32(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 41);
+        assert(ar[1] == 32);
+        assert(ar[2] == 23);
+        assert(ar[3] == 14);
+    }
+
+    void testSubEpi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v1 = _mm_set_epi32(4, 3, 2, 1);
+        __m128i v2 = _mm_set_epi32(10, 20, 30, 40);
+        __m128i v = _mm_sub_epi32(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == -39);
+        assert(ar[1] == -28);
+        assert(ar[2] == -17);
+        assert(ar[3] == -6);
+    }
+
+    int main(int argc, char ** argv) {
+        testSetPs();
+        testSet1Ps();
+        testSetZeroPs();
+        testSetEpi32();
+        testSet1Epi32();
+        testSetZeroSi128();
+        testBitCasts();
+        testConversions();
+        testMoveMaskPs();
+        testAddPs();
+        testSubPs();
+        testMulPs();
+        testDivPs();
+        testMaxPs();
+        testMinPs();
+        testSqrtPs();
+        testCmpLtPs();
+        testCmpLePs();
+        testCmpEqPs();
+        testCmpGePs();
+        testCmpGtPs();
+        testAndPs();
+        testAndNotPs();
+        testOrPs();
+        testXorPs();
+        testAndSi128();
+        testAndNotSi128();
+        testOrSi128();
+        testXorSi128();
+        testAddEpi32();
+        testSubEpi32();
+        printf("DONE");
+        return 0;
+    }
+    '''
+
+    self.do_run(src, 'DONE')
+
+
   def test_gcc_unmangler(self):
     Settings.NAMED_GLOBALS = 1 # test coverage for this