9 files changed, 1406 insertions, 460 deletions
diff --git a/AUTHORS b/AUTHORS
index 2b2b21ed..b03bfe7b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -110,3 +110,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * John Vilk <jvilk@cs.umass.edu>
 * Daniel Baulig <dbaulig@fb.com> (copyright owned by Facebook, Inc.)
 * Lu Wang <coolwanglu@gmail.com>
+* Heidi Pan <heidi.pan@intel.com> (copyright owned by Intel)
diff --git a/src/jsifier.js b/src/jsifier.js
index cb753e57..fb6c5ba8 100644
--- a/src/jsifier.js
+++ b/src/jsifier.js
@@ -1373,8 +1373,9 @@ function JSify(data, functionsOnly, givenFunctions) {
   function insertelementHandler(item) {
     var base = getVectorBaseType(item.type);
     var ident = ensureVector(item.ident, base);
+    var laneOp = ((base == 'float') ? 'SIMD.float32x4.with' : 'SIMD.int32x4.with');
     //return ident + '.with' + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + finalizeLLVMParameter(item.value) + ')';
-    return 'SIMD.with' + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + ident + ',' + finalizeLLVMParameter(item.value) + ')';
+    return laneOp + SIMDLane[finalizeLLVMParameter(item.index)] + '(' + ident + ',' + finalizeLLVMParameter(item.value) + ')';
   }
   function extractelementHandler(item) {
     var base = getVectorBaseType(item.type);
diff --git a/src/library.js b/src/library.js
index 128bb211..faca945c 100644
--- a/src/library.js
+++ b/src/library.js
@@ -8736,8 +8736,72 @@ LibraryManager.library = {
   // emscripten vector ops
   //============================
 
-  emscripten_float32x4_signmask__inline: function(x) {
-    return x + '.signMask()';
+  emscripten_float32x4_signmask__inline: function(a) {
+    return 'SIMD.float32x4.bitsToInt32x4(' + a + ').signMask';
+  },
+  
+  emscripten_float32x4_min__inline: function(a, b) {
+    return 'SIMD.float32x4.min(' + a + ', ' + b + ')';
+  },
+  
+  emscripten_float32x4_max__inline: function(a, b) {
+    return 'SIMD.float32x4.max(' + a + ', ' + b + ')';
+  },
+  
+  emscripten_float32x4_sqrt__inline: function(a) {
+    return 'SIMD.float32x4.sqrt(' + a + ')';
+  },
+  
+  emscripten_float32x4_lessThan__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.lessThan(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_lessThanOrEqual__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.lessThanOrEqual(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_equal__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.equal(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_greaterThanOrEqual__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.greaterThanOrEqual(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_greaterThan__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.float32x4.greaterThan(' + a + ', ' + b + '))';
+  },
+  
+  emscripten_float32x4_and__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.and(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_andNot__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.and(SIMD.int32x4.not(SIMD.float32x4.bitsToInt32x4(' + a + ')), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_or__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.or(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_float32x4_xor__inline: function(a, b) {
+    return 'SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.xor(SIMD.float32x4.bitsToInt32x4(' + a + '), SIMD.float32x4.bitsToInt32x4(' + b + ')))';
+  },
+  
+  emscripten_int32x4_bitsToFloat32x4__inline: function(a) {
+      return 'SIMD.int32x4.bitsToFloat32x4(' + a + ')';
+  },
+  
+  emscripten_int32x4_toFloat32x4__inline: function(a) {
+      return 'SIMD.int32x4.toFloat32x4(' + a + ')';
+  },
+  
+  emscripten_float32x4_bitsToInt32x4__inline: function(a) {
+      return 'SIMD.float32x4.bitsToInt32x4(' + a + ')';
+  },
+  
+  emscripten_float32x4_toInt32x4__inline: function(a) {
+      return 'SIMD.float32x4.toInt32x4(' + a + ')';
   },
 
   //============================
diff --git a/src/parseTools.js b/src/parseTools.js
index 134cb89a..ffd7c758 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -362,7 +362,7 @@ function getVectorNativeType(type) {
 
 function getSIMDName(type) {
   switch (type) {
-    case 'i32': return 'uint';
+    case 'i32': return 'int';
     case 'float': return 'float';
     default: throw 'getSIMDName ' + type;
   }
@@ -2372,29 +2372,28 @@ function processMathop(item) {
     // vector/SIMD operation
     Types.usesSIMD = true;
     switch (op) {
-      case 'fadd': return 'SIMD.add(' + idents[0] + ',' + idents[1] + ')';
-      case 'fsub': return 'SIMD.sub(' + idents[0] + ',' + idents[1] + ')';
-      case 'fmul': return 'SIMD.mul(' + idents[0] + ',' + idents[1] + ')';
-      case 'fdiv': return 'SIMD.div(' + idents[0] + ',' + idents[1] + ')';
-      case 'add' : return 'SIMD.addu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'sub' : return 'SIMD.subu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'mul' : return 'SIMD.mulu32(' + idents[0] + ',' + idents[1] + ')';
-      case 'udiv': return 'SIMD.divu32(' + idents[0] + ',' + idents[1] + ')';
+      case 'fadd': return 'SIMD.float32x4.add(' + idents[0] + ',' + idents[1] + ')';
+      case 'fsub': return 'SIMD.float32x4.sub(' + idents[0] + ',' + idents[1] + ')';
+      case 'fmul': return 'SIMD.float32x4.mul(' + idents[0] + ',' + idents[1] + ')';
+      case 'fdiv': return 'SIMD.float32x4.div(' + idents[0] + ',' + idents[1] + ')';
+      case 'add' : return 'SIMD.int32x4.add(' + idents[0] + ',' + idents[1] + ')';
+      case 'sub' : return 'SIMD.int32x4.sub(' + idents[0] + ',' + idents[1] + ')';
+      case 'mul' : return 'SIMD.int32x4.mul(' + idents[0] + ',' + idents[1] + ')';
       case 'bitcast': {
         var inType = item.params[0].type;
         var outType = item.type;
         if (inType === '<4 x float>') {
           assert(outType === '<4 x i32>');
-          return 'SIMD.float32x4BitsToUint32x4(' + idents[0] + ')';
+          return 'SIMD.float32x4.bitsToInt32x4(' + idents[0] + ')';
         } else {
           assert(inType === '<4 x i32>');
           assert(outType === '<4 x float>');
-          return 'SIMD.uint32x4BitsToFloat32x4(' + idents[0] + ')';
+          return 'SIMD.int32x4.bitsToFloat32x4(' + idents[0] + ')';
         }
       }
-      case 'and': return 'SIMD.and(' + idents[0] + ',' + idents[1] + ')';
-      case 'or': return 'SIMD.or(' + idents[0] + ',' + idents[1] + ')';
-      case 'xor': return 'SIMD.xor(' + idents[0] + ',' + idents[1] + ')';
+      case 'and': return 'SIMD.int32x4.and(' + idents[0] + ',' + idents[1] + ')';
+      case 'or': return 'SIMD.int32x4.or(' + idents[0] + ',' + idents[1] + ')';
+      case 'xor': return 'SIMD.int32x4.xor(' + idents[0] + ',' + idents[1] + ')';
       default: throw 'vector op todo: ' + dump(item);
     }
   }
@@ -2698,7 +2697,7 @@ var simdLane = ['x', 'y', 'z', 'w'];
 
 function ensureVector(ident, base) {
   Types.usesSIMD = true;
-  return ident == 0 ? base + '32x4.zero()' : ident;
+  return ident == 0 ? base + '32x4.splat(0)' : ident;
 }
 
 function ensureValidFFIType(type) {
diff --git a/src/simd.js b/src/simd.js
index bbb12d0a..c7f5ff48 100644
--- a/src/simd.js
+++ b/src/simd.js
@@ -20,8 +20,10 @@
   https://github.com/johnmccutchan/ecmascript_simd/blob/master/src/ecmascript_simd.js
 */
 
+"use strict";
+
 /**
-  * Construct a new instance of a float32x4 number.
+  * Construct a new instance of float32x4 number.
   * @param {double} value used for x lane.
   * @param {double} value used for y lane.
   * @param {double} value used for z lane.
@@ -40,7 +42,7 @@ function float32x4(x, y, z, w) {
 }
 
 /**
-  * Construct a new instance of a float32x4 number with 0.0 in all lanes.
+  * Construct a new instance of float32x4 number with 0.0 in all lanes.
   * @constructor
   */
 float32x4.zero = function() {
@@ -48,7 +50,7 @@ float32x4.zero = function() {
 }
 
 /**
-  * Construct a new instance of a float32x4 number with the same value
+  * Construct a new instance of float32x4 number with the same value
   * in all lanes.
   * @param {double} value used for all lanes.
   * @constructor
@@ -87,18 +89,18 @@ Object.defineProperty(float32x4.prototype, 'signMask', {
 });
 
 /**
-  * Construct a new instance of a uint32x4 number.
+  * Construct a new instance of int32x4 number.
   * @param {integer} 32-bit unsigned value used for x lane.
   * @param {integer} 32-bit unsigned value used for y lane.
   * @param {integer} 32-bit unsigned value used for z lane.
   * @param {integer} 32-bit unsigned value used for w lane.
   * @constructor
   */
-function uint32x4(x, y, z, w) {
-  if (!(this instanceof uint32x4)) {
-    return new uint32x4(x, y, z, w);
+function int32x4(x, y, z, w) {
+  if (!(this instanceof int32x4)) {
+    return new int32x4(x, y, z, w);
   }
-  this.storage_ = new Uint32Array(4);
+  this.storage_ = new Int32Array(4);
   this.storage_[0] = x;
   this.storage_[1] = y;
   this.storage_[2] = z;
@@ -106,7 +108,7 @@ function uint32x4(x, y, z, w) {
 }
 
 /**
-  * Construct a new instance of a uint32x4 number with 0xFFFFFFFF or 0x0 in each
+  * Construct a new instance of int32x4 number with 0xFFFFFFFF or 0x0 in each
   * lane, depending on the truth value in x, y, z, and w.
   * @param {boolean} flag used for x lane.
   * @param {boolean} flag used for y lane.
@@ -114,59 +116,59 @@ function uint32x4(x, y, z, w) {
   * @param {boolean} flag used for w lane.
   * @constructor
   */
-uint32x4.bool = function(x, y, z, w) {
-  return uint32x4(x ? 0xFFFFFFFF : 0x0,
-                  y ? 0xFFFFFFFF : 0x0,
-                  z ? 0xFFFFFFFF : 0x0,
-                  w ? 0xFFFFFFFF : 0x0);
+int32x4.bool = function(x, y, z, w) {
+  return int32x4(x ? -1 : 0x0,
+                  y ? -1 : 0x0,
+                  z ? -1 : 0x0,
+                  w ? -1 : 0x0);
 }
 
 /**
-  * Construct a new instance of a uint32x4 number with the same value
+  * Construct a new instance of int32x4 number with the same value
   * in all lanes.
   * @param {integer} value used for all lanes.
   * @constructor
   */
-uint32x4.splat = function(s) {
-  return uint32x4(s, s, s, s);
+int32x4.splat = function(s) {
+  return int32x4(s, s, s, s);
 }
 
-Object.defineProperty(uint32x4.prototype, 'x', {
+Object.defineProperty(int32x4.prototype, 'x', {
   get: function() { return this.storage_[0]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'y', {
+Object.defineProperty(int32x4.prototype, 'y', {
   get: function() { return this.storage_[1]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'z', {
+Object.defineProperty(int32x4.prototype, 'z', {
   get: function() { return this.storage_[2]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'w',
+Object.defineProperty(int32x4.prototype, 'w',
   { get: function() { return this.storage_[3]; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagX', {
+Object.defineProperty(int32x4.prototype, 'flagX', {
   get: function() { return this.storage_[0] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagY', {
+Object.defineProperty(int32x4.prototype, 'flagY', {
   get: function() { return this.storage_[1] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagZ', {
+Object.defineProperty(int32x4.prototype, 'flagZ', {
   get: function() { return this.storage_[2] != 0x0; }
 });
 
-Object.defineProperty(uint32x4.prototype, 'flagW',
+Object.defineProperty(int32x4.prototype, 'flagW',
   { get: function() { return this.storage_[3] != 0x0; }
 });
 
 /**
   * Extract the sign bit from each lane return them in the first 4 bits.
   */
-Object.defineProperty(uint32x4.prototype, 'signMask', {
+Object.defineProperty(int32x4.prototype, 'signMask', {
   get: function() {
     var mx = (this.storage_[0] & 0x80000000) >>> 31;
     var my = (this.storage_[1] & 0x80000000) >>> 31;
@@ -287,414 +289,580 @@ Float32x4Array.prototype.setAt = function(i, v) {
   this.storage_[i*4+3] = v.w;
 }
 
+
+function Int32x4Array(a, b, c) { // a: element count, Int32x4Array to copy, or ArrayBuffer to view; b: byte offset; c: element count (ArrayBuffer form only)
+  // Array of int32x4 elements, stored as four consecutive Int32Array slots per element.
+  function isNumber(o) {
+      return typeof o == "number" || (typeof o == "object" && o.constructor === Number); // NOTE(review): null is typeof "object", so a null argument throws a TypeError on o.constructor here
+  }
+  // True for the built-in typed arrays and for the Int32x4Array / Float32x4Array wrappers.
+  function isTypedArray(o) {
+    return (o instanceof Int8Array) ||
+           (o instanceof Uint8Array) ||
+           (o instanceof Uint8ClampedArray) ||
+           (o instanceof Int16Array) ||
+           (o instanceof Uint16Array) ||
+           (o instanceof Int32Array) ||
+           (o instanceof Uint32Array) ||
+           (o instanceof Float32Array) ||
+           (o instanceof Float64Array) ||
+           (o instanceof Int32x4Array) ||
+           (o instanceof Float32x4Array);
+  }
+
+  function isArrayBuffer(o) {
+    return (o instanceof ArrayBuffer);
+  }
+  // Dispatch on the type of the first argument.
+  if (isNumber(a)) { // new Int32x4Array(length): fresh storage of a*4 lanes
+    this.storage_ = new Int32Array(a*4);
+    this.length_ = a;
+    this.byteOffset_ = 0;
+    return;
+  } else if (isTypedArray(a)) { // copy form; only Int32x4Array sources are supported
+    if (!(a instanceof Int32x4Array)) {
+      throw "Copying typed array of non-Int32x4Array is unimplemented.";
+    }
+    this.storage_ = new Int32Array(a.length * 4);
+    this.length_ = a.length;
+    this.byteOffset_ = 0;
+    // Copy every int32 lane (four per element).
+    for (var i = 0; i < a.length*4; i++) {
+      this.storage_[i] = a.storage_[i];
+    }
+  } else if (isArrayBuffer(a)) { // view form: storage_ is an Int32Array over the caller's buffer
+    if ((b != undefined) && (b % Int32x4Array.BYTES_PER_ELEMENT) != 0) {
+      throw "byteOffset must be a multiple of 16.";
+    }
+    if (c != undefined) {
+      c *= 4; // c counts int32x4 elements; Int32Array takes a lane count
+      this.storage_ = new Int32Array(a, b, c);
+    }
+    else {
+      // Note: new Int32Array(a, b) is NOT equivalent to new Int32Array(a, b, undefined) (author's note; engine-dependent - TODO confirm)
+      this.storage_ = new Int32Array(a, b);
+    }
+    this.length_ = this.storage_.length / 4; // element count derived from the lane count
+    this.byteOffset_ = b != undefined ? b : 0;
+  } else {
+    throw "Unknown type of first argument.";
+  }
+}
+
+Object.defineProperty(Int32x4Array.prototype, 'length', // read-only: number of int32x4 elements (length_)
+  { get: function() { return this.length_; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'byteLength', // read-only: length_ * 16 bytes
+  { get: function() { return this.length_ * Int32x4Array.BYTES_PER_ELEMENT; }
+});
+
+Object.defineProperty(Int32x4Array, 'BYTES_PER_ELEMENT', // on the constructor: always 16
+  { get: function() { return 16; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'BYTES_PER_ELEMENT', // on instances: always 16
+  { get: function() { return 16; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'byteOffset', // read-only: offset recorded by the constructor (0 unless built from an ArrayBuffer with b)
+  { get: function() { return this.byteOffset_; }
+});
+
+Object.defineProperty(Int32x4Array.prototype, 'buffer', // read-only: ArrayBuffer behind the Int32Array storage
+  { get: function() { return this.storage_.buffer; }
+});
+
+Int32x4Array.prototype.getAt = function(i) { // Read element i as an int32x4; throws a string if i is negative or >= length.
+  if (i < 0) {
+    throw "Index must be >= 0.";
+  }
+  if (i >= this.length) {
+    throw "Index out of bounds.";
+  }
+  var x = this.storage_[i*4+0]; // each element occupies four consecutive Int32Array slots
+  var y = this.storage_[i*4+1];
+  var z = this.storage_[i*4+2];
+  var w = this.storage_[i*4+3];
+  return int32x4(x, y, z, w); // must be int32x4, not float32x4: setAt only accepts int32x4, and float32 lanes round integers above 2^24
+}
+
+Int32x4Array.prototype.setAt = function(i, v) { // Store int32x4 v at element i; throws a string on a bad index or a non-int32x4 value.
+  if (i < 0) {
+    throw "Index must be >= 0.";
+  }
+  if (i >= this.length) {
+    throw "Index out of bounds.";
+  }
+  if (!(v instanceof int32x4)) {
+    throw "Value is not a int32x4.";
+  }
+  this.storage_[i*4+0] = v.x; // write the four lanes into consecutive Int32Array slots
+  this.storage_[i*4+1] = v.y;
+  this.storage_[i*4+2] = v.z;
+  this.storage_[i*4+3] = v.w;
+}
+
 var SIMD = (function () {
   return {
-    /**
-      * @return {float32x4} New instance of float32x4 with absolute values of
-      * t.
-      */
-    abs: function(t) {
-      return new float32x4(Math.abs(t.x), Math.abs(t.y), Math.abs(t.z),
-                           Math.abs(t.w));
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with negated values of
-      * t.
-      */
-    neg: function(t) {
-      return new float32x4(-t.x, -t.y, -t.z, -t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a + b.
-      */
-    add: function(a, b) {
-      return new float32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a - b.
-      */
-    sub: function(a, b) {
-      return new float32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a * b.
-      */
-    mul: function(a, b) {
-      return new float32x4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with a / b.
-      */
-    div: function(a, b) {
-      return new float32x4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with t's values clamped
-      * between lowerLimit and upperLimit.
-      */
-    clamp: function(t, lowerLimit, upperLimit) {
-      var cx = t.x < lowerLimit.x ? lowerLimit.x : t.x;
-      var cy = t.y < lowerLimit.y ? lowerLimit.y : t.y;
-      var cz = t.z < lowerLimit.z ? lowerLimit.z : t.z;
-      var cw = t.w < lowerLimit.w ? lowerLimit.w : t.w;
-      cx = cx > upperLimit.x ? upperLimit.x : cx;
-      cy = cy > upperLimit.y ? upperLimit.y : cy;
-      cz = cz > upperLimit.z ? upperLimit.z : cz;
-      cw = cw > upperLimit.w ? upperLimit.w : cw;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with the minimum value of
-      * t and other.
-      */
-    min: function(t, other) {
-      var cx = t.x > other.x ? other.x : t.x;
-      var cy = t.y > other.y ? other.y : t.y;
-      var cz = t.z > other.z ? other.z : t.z;
-      var cw = t.w > other.w ? other.w : t.w;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with the maximum value of
-      * t and other.
-      */
-    max: function(t, other) {
-      var cx = t.x < other.x ? other.x : t.x;
-      var cy = t.y < other.y ? other.y : t.y;
-      var cz = t.z < other.z ? other.z : t.z;
-      var cw = t.w < other.w ? other.w : t.w;
-      return new float32x4(cx, cy, cz, cw);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with reciprocal value of
-      * t.
-      */
-    reciprocal: function(t) {
-      return new float32x4(1.0 / t.x, 1.0 / t.y, 1.0 / t.z, 1.0 / t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with square root of the
-      * reciprocal value of t.
-      */
-    reciprocalSqrt: function(t) {
-      return new float32x4(Math.sqrt(1.0 / t.x), Math.sqrt(1.0 / t.y),
-                           Math.sqrt(1.0 / t.z), Math.sqrt(1.0 / t.w));
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with values of t
-      * scaled by s.
-      */
-    scale: function(t, s) {
-      return new float32x4(s * t.x, s * t.y, s * t.z, s * t.w);
-    },
-    /**
-      * @return {float32x4} New instance of float32x4 with square root of
-      * values of t.
-      */
-    sqrt: function(t) {
-      return new float32x4(Math.sqrt(t.x), Math.sqrt(t.y),
-                           Math.sqrt(t.z), Math.sqrt(t.w));
-    },
-    /**
-      * @param {float32x4} t An instance of float32x4 to be shuffled.
-      * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
-      * @return {float32x4} New instance of float32x4 with lanes shuffled.
-      */
-    shuffle: function(t, mask) {
-      var _x = (mask) & 0x3;
-      var _y = (mask >> 2) & 0x3;
-      var _z = (mask >> 4) & 0x3;
-      var _w = (mask >> 6) & 0x3;
-      return new float32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
-                           t.storage_[_w]);
+    float32x4: {
+        /**
+        * @return {float32x4} New instance of float32x4 with absolute values of
+        * t.
+        */
+      abs: function(t) {
+        return new float32x4(Math.abs(t.x), Math.abs(t.y), Math.abs(t.z),
+                             Math.abs(t.w));
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with negated values of
+        * t.
+        */
+      neg: function(t) {
+        return new float32x4(-t.x, -t.y, -t.z, -t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a + b.
+        */
+      add: function(a, b) {
+        return new float32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a - b.
+        */
+      sub: function(a, b) {
+        return new float32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a * b.
+        */
+      mul: function(a, b) {
+        return new float32x4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with a / b.
+        */
+      div: function(a, b) {
+        return new float32x4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with t's values clamped
+        * between lowerLimit and upperLimit.
+        */
+      clamp: function(t, lowerLimit, upperLimit) {
+        var cx = t.x < lowerLimit.x ? lowerLimit.x : t.x;
+        var cy = t.y < lowerLimit.y ? lowerLimit.y : t.y;
+        var cz = t.z < lowerLimit.z ? lowerLimit.z : t.z;
+        var cw = t.w < lowerLimit.w ? lowerLimit.w : t.w;
+        cx = cx > upperLimit.x ? upperLimit.x : cx;
+        cy = cy > upperLimit.y ? upperLimit.y : cy;
+        cz = cz > upperLimit.z ? upperLimit.z : cz;
+        cw = cw > upperLimit.w ? upperLimit.w : cw;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with the minimum value of
+        * t and other.
+        */
+      min: function(t, other) {
+        var cx = t.x > other.x ? other.x : t.x;
+        var cy = t.y > other.y ? other.y : t.y;
+        var cz = t.z > other.z ? other.z : t.z;
+        var cw = t.w > other.w ? other.w : t.w;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with the maximum value of
+        * t and other.
+        */
+      max: function(t, other) {
+        var cx = t.x < other.x ? other.x : t.x;
+        var cy = t.y < other.y ? other.y : t.y;
+        var cz = t.z < other.z ? other.z : t.z;
+        var cw = t.w < other.w ? other.w : t.w;
+        return new float32x4(cx, cy, cz, cw);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with reciprocal value of
+        * t.
+        */
+      reciprocal: function(t) {
+        return new float32x4(1.0 / t.x, 1.0 / t.y, 1.0 / t.z, 1.0 / t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with square root of the
+        * reciprocal value of t.
+        */
+      reciprocalSqrt: function(t) {
+        return new float32x4(Math.sqrt(1.0 / t.x), Math.sqrt(1.0 / t.y),
+                             Math.sqrt(1.0 / t.z), Math.sqrt(1.0 / t.w));
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with values of t
+        * scaled by s.
+        */
+      scale: function(t, s) {
+        return new float32x4(s * t.x, s * t.y, s * t.z, s * t.w);
+      },
+      /**
+        * @return {float32x4} New instance of float32x4 with square root of
+        * values of t.
+        */
+      sqrt: function(t) {
+        return new float32x4(Math.sqrt(t.x), Math.sqrt(t.y),
+                             Math.sqrt(t.z), Math.sqrt(t.w));
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4 to be shuffled.
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {float32x4} New instance of float32x4 with lanes shuffled.
+        */
+      shuffle: function(t, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new float32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
+                             t.storage_[_w]);
+      },
+      /**
+        * @param {float32x4} t1 An instance of float32x4 to be shuffled. XY lanes in result
+        * @param {float32x4} t2 An instance of float32x4 to be shuffled. ZW lanes in result
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {float32x4} New instance of float32x4 with lanes shuffled.
+        */
+      shuffleMix: function(t1, t2, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new float32x4(t1.storage_[_x], t1.storage_[_y], t2.storage_[_z],
+                             t2.storage_[_w]);
+      },
+      /**
+        * @param {double} value used for x lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * x replaced with {x}.
+        */
+      withX: function(t, x) {
+        return new float32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {double} value used for y lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * y replaced with {y}.
+        */
+      withY: function(t, y) {
+        return new float32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {double} value used for z lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * z replaced with {z}.
+        */
+      withZ: function(t, z) {
+        return new float32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {double} value used for w lane.
+        * @return {float32x4} New instance of float32x4 with the values in t and
+        * w replaced with {w}.
+        */
+      withW: function(t, w) {
+        return new float32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t < other.
+        */
+      lessThan: function(t, other) {
+        var cx = t.x < other.x;
+        var cy = t.y < other.y;
+        var cz = t.z < other.z;
+        var cw = t.w < other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t <= other.
+        */
+      lessThanOrEqual: function(t, other) {
+        var cx = t.x <= other.x;
+        var cy = t.y <= other.y;
+        var cz = t.z <= other.z;
+        var cw = t.w <= other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t == other.
+        */
+      equal: function(t, other) {
+        var cx = t.x == other.x;
+        var cy = t.y == other.y;
+        var cz = t.z == other.z;
+        var cw = t.w == other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t != other.
+        */
+      notEqual: function(t, other) {
+        var cx = t.x != other.x;
+        var cy = t.y != other.y;
+        var cz = t.z != other.z;
+        var cw = t.w != other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t >= other.
+        */
+      greaterThanOrEqual: function(t, other) {
+        var cx = t.x >= other.x;
+        var cy = t.y >= other.y;
+        var cz = t.z >= other.z;
+        var cw = t.w >= other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @param {float32x4} other An instance of float32x4.
+        * @return {int32x4} 0xFFFFFFFF or 0x0 in each lane depending on
+        * the result of t > other.
+        */
+      greaterThan: function(t, other) {
+        var cx = t.x > other.x;
+        var cy = t.y > other.y;
+        var cz = t.z > other.z;
+        var cw = t.w > other.w;
+        return int32x4.bool(cx, cy, cz, cw);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @return {int32x4} a bit-wise copy of t as an int32x4.
+        */
+      bitsToInt32x4: function(t) {
+        var alias = new Int32Array(t.storage_.buffer);
+        return new int32x4(alias[0], alias[1], alias[2], alias[3]);
+      },
+      /**
+        * @param {float32x4} t An instance of float32x4.
+        * @return {int32x4} with a float to integer conversion of t.
+        */
+      toInt32x4: function(t) {
+        var a = new int32x4(t.storage_[0], t.storage_[1], t.storage_[2],
+                             t.storage_[3]);
+        return a;
+      }
     },
-    /**
-      * @param {double} value used for x lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * x replaced with {x}.
-      */
-    withX: function(t, x) {
-      return new float32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * @param {double} value used for y lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * y replaced with {y}.
-      */
-    withY: function(t, y) {
-      return new float32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {double} value used for z lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * z replaced with {z}.
-      */
-    withZ: function(t, z) {
-      return new float32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {double} value used for w lane.
-      * @return {float32x4} New instance of float32x4 with the values in t and
-      * w replaced with {w}.
-      */
-    withW: function(t, w) {
-      return new float32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t < other.
-      */
-    lessThan: function(t, other) {
-      var cx = t.x < other.x;
-      var cy = t.y < other.y;
-      var cz = t.z < other.z;
-      var cw = t.w < other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t <= other.
-      */
-    lessThanOrEqual: function(t, other) {
-      var cx = t.x <= other.x;
-      var cy = t.y <= other.y;
-      var cz = t.z <= other.z;
-      var cw = t.w <= other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t == other.
-      */
-    equal: function(t, other) {
-      var cx = t.x == other.x;
-      var cy = t.y == other.y;
-      var cz = t.z == other.z;
-      var cw = t.w == other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t != other.
-      */
-    notEqual: function(t, other) {
-      var cx = t.x != other.x;
-      var cy = t.y != other.y;
-      var cz = t.z != other.z;
-      var cw = t.w != other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t >= other.
-      */
-    greaterThanOrEqual: function(t, other) {
-      var cx = t.x >= other.x;
-      var cy = t.y >= other.y;
-      var cz = t.z >= other.z;
-      var cw = t.w >= other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @param {float32x4} other An instance of a float32x4.
-      * @return {uint32x4} 0xFFFFFFFF or 0x0 in each lane depending on
-      * the result of t > other.
-      */
-    greaterThan: function(t, other) {
-      var cx = t.x > other.x;
-      var cy = t.y > other.y;
-      var cz = t.z > other.z;
-      var cw = t.w > other.w;
-      return uint32x4.bool(cx, cy, cz, cw);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a & b.
-      */
-    and: function(a, b) {
-      return new uint32x4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a | b.
-      */
-    or: function(a, b) {
-      return new uint32x4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of a uint32x4.
-      * @param {uint32x4} b An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a ^ b.
-      */
-    xor: function(a, b) {
-      return new uint32x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of ~a
-      */
-    negu32: function(t) {
-      return new uint32x4(~t.x, ~t.y, ~t.z, ~t.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a + b.
-      */
-    addu32: function(a, b) {
-      return new uint32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a - b.
-      */
-    subu32: function(a, b) {
-      return new uint32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-    },
-    /**
-      * @param {uint32x4} a An instance of uint32x4.
-      * @param {uint32x4} b An instance of uint32x4.
-      * @return {uint32x4} New instance of uint32x4 with values of a * b.
-      */
-    mulu32: function(a, b) {
-      return new uint32x4(Math.imul(a.x, b.x), Math.imul(a.y, b.y),
-                          Math.imul(a.z, b.z), Math.imul(a.w, b.w));
-    },
-    /**
-      * @param {float32x4}
-      */
-    select: function(t, trueValue, falseValue) {
-      var tv = SIMD.float32x4BitsToUint32x4(trueValue);
-      var fv = SIMD.float32x4BitsToUint32x4(falseValue);
-      var tr = SIMD.and(t, tv);
-      var fr = SIMD.and(SIMD.negu32(t), fv);
-      return SIMD.uint32x4BitsToFloat32x4(SIMD.or(tr, fr));
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for x lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * x lane replaced with {x}.
-      */
-    withXu32: function(t, x) {
-      return new uint32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for y lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * y lane replaced with {y}.
-      */
-    withYu32: function(t, y) {
-      return new uint32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {integer} 32-bit value used for z lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * z lane replaced with {z}.
-      */
-    withZu32: function(t, z) {
-      return new uint32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {integer} 32-bit value used for w lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * w lane replaced with {w}.
-      */
-    withWu32: function(t, w) {
-      return new uint32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} x flag used for x lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * x lane replaced with {x}.
-      */
-    withFlagX: function(t, flagX) {
-      var x = flagX ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(x, t.y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} y flag used for y lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * y lane replaced with {y}.
-      */
-    withFlagY: function(t, flagY) {
-      var y = flagY ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, y, t.z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} z flag used for z lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * z lane replaced with {z}.
-      */
-    withFlagZ: function(t, flagZ) {
-      var z = flagZ ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, t.y, z, t.w);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @param {boolean} w flag used for w lane.
-      * @return {uint32x4} New instance of uint32x4 with the values in t and
-      * w lane replaced with {w}.
-      */
-    withFlagW: function(t, flagW) {
-      var w = flagW ? 0xFFFFFFFF : 0x0;
-      return new uint32x4(t.x, t.y, t.z, w);
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @return {uint32x4} a bit-wise copy of t as a uint32x4.
-      */
-    float32x4BitsToUint32x4: function(t) {
-      var alias = new Uint32Array(t.storage_.buffer);
-      return new uint32x4(alias[0], alias[1], alias[2], alias[3]);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {float32x4} a bit-wise copy of t as a float32x4.
-      */
-    uint32x4BitsToFloat32x4: function(t) {
-      var alias = new Float32Array(t.storage_.buffer);
-      return new float32x4(alias[0], alias[1], alias[2], alias[3]);
-    },
-    /**
-      * @param {uint32x4} t An instance of a uint32x4.
-      * @return {float32x4} with a float to integer conversion copy of t.
-      */
-    uint32x4ToFloat32x4: function(t) {
-      var a = float32x4.zero();
-      a.storage_[0] = t.storage_[0];
-      a.storage_[1] = t.storage_[1];
-      a.storage_[2] = t.storage_[2];
-      a.storage_[3] = t.storage_[3];
-      return a;
-    },
-    /**
-      * @param {float32x4} t An instance of a float32x4.
-      * @return {uint32x4} with a integer to float conversion of t.
-      */
-    float32x4ToUint32x4: function(t) {
-      var a = new uint32x4(t.storage_[0], t.storage_[1], t.storage_[2],
-                           t.storage_[3]);
-      return a;
+    int32x4: {
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a & b.
+        */
+      and: function(a, b) {
+        return new int32x4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a | b.
+        */
+      or: function(a, b) {
+        return new int32x4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a ^ b.
+        */
+      xor: function(a, b) {
+        return new int32x4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of ~t
+        */
+      not: function(t) {
+        return new int32x4(~t.x, ~t.y, ~t.z, ~t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of -t
+        */
+      neg: function(t) {
+        return new int32x4(-t.x, -t.y, -t.z, -t.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a + b.
+        */
+      add: function(a, b) {
+        return new int32x4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a - b.
+        */
+      sub: function(a, b) {
+        return new int32x4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+      },
+      /**
+        * @param {int32x4} a An instance of int32x4.
+        * @param {int32x4} b An instance of int32x4.
+        * @return {int32x4} New instance of int32x4 with values of a * b.
+        */
+      mul: function(a, b) {
+        return new int32x4(Math.imul(a.x, b.x), Math.imul(a.y, b.y),
+                           Math.imul(a.z, b.z), Math.imul(a.w, b.w));
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4 to be shuffled.
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {int32x4} New instance of int32x4 with lanes shuffled.
+        */
+      shuffle: function(t, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new int32x4(t.storage_[_x], t.storage_[_y], t.storage_[_z],
+                             t.storage_[_w]);
+      },
+      /**
+        * @param {int32x4} t1 An instance of int32x4 to be shuffled. XY lanes in result
+        * @param {int32x4} t2 An instance of int32x4 to be shuffled. ZW lanes in result
+        * @param {integer} mask One of the 256 shuffle masks, for example, SIMD.XXXX.
+        * @return {int32x4} New instance of int32x4 with lanes shuffled.
+        */
+      shuffleMix: function(t1, t2, mask) {
+        var _x = (mask) & 0x3;
+        var _y = (mask >> 2) & 0x3;
+        var _z = (mask >> 4) & 0x3;
+        var _w = (mask >> 6) & 0x3;
+        return new int32x4(t1.storage_[_x], t1.storage_[_y], t2.storage_[_z],
+                             t2.storage_[_w]);
+      },
+      /**
+        * @param {int32x4} t Bit mask: result takes trueValue's bits where t is set, falseValue's bits elsewhere (float32x4 in, float32x4 out).
+        */
+      select: function(t, trueValue, falseValue) {
+        var tv = SIMD.float32x4.bitsToInt32x4(trueValue);
+        var fv = SIMD.float32x4.bitsToInt32x4(falseValue);
+        var tr = SIMD.int32x4.and(t, tv);
+        var fr = SIMD.int32x4.and(SIMD.int32x4.not(t), fv);
+        return SIMD.int32x4.bitsToFloat32x4(SIMD.int32x4.or(tr, fr));
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for x lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * x lane replaced with {x}.
+        */
+      withX: function(t, x) {
+        return new int32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for y lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * y lane replaced with {y}.
+        */
+      withY: function(t, y) {
+        return new int32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for z lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * z lane replaced with {z}.
+        */
+      withZ: function(t, z) {
+        return new int32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {integer} 32-bit value used for w lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and w lane replaced with {w}.
+        */
+      withW: function(t, w) {
+        return new int32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} x flag used for x lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * x lane replaced with {x}.
+        */
+      withFlagX: function(t, flagX) {
+        var x = flagX ? 0xFFFFFFFF : 0x0;
+        return new int32x4(x, t.y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} y flag used for y lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * y lane replaced with {y}.
+        */
+      withFlagY: function(t, flagY) {
+        var y = flagY ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, y, t.z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} z flag used for z lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * z lane replaced with {z}.
+        */
+      withFlagZ: function(t, flagZ) {
+        var z = flagZ ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, t.y, z, t.w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @param {boolean} w flag used for w lane.
+        * @return {int32x4} New instance of int32x4 with the values in t and
+        * w lane replaced with {w}.
+        */
+      withFlagW: function(t, flagW) {
+        var w = flagW ? 0xFFFFFFFF : 0x0;
+        return new int32x4(t.x, t.y, t.z, w);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {float32x4} a bit-wise copy of t as a float32x4.
+        */
+      bitsToFloat32x4: function(t) {
+        var alias = new Float32Array(t.storage_.buffer);
+        return new float32x4(alias[0], alias[1], alias[2], alias[3]);
+      },
+      /**
+        * @param {int32x4} t An instance of int32x4.
+        * @return {float32x4} with an integer to float conversion copy of t.
+        */
+      toFloat32x4: function(t) {
+        var a = float32x4.zero();
+        a.storage_[0] = t.storage_[0];
+        a.storage_[1] = t.storage_[1];
+        a.storage_[2] = t.storage_[2];
+        a.storage_[3] = t.storage_[3];
+        return a;
+      }
     }
   }
 })();
@@ -955,4 +1123,3 @@ Object.defineProperty(SIMD, 'WWWX', { get: function() { return 0x3F; } });
 Object.defineProperty(SIMD, 'WWWY', { get: function() { return 0x7F; } });
 Object.defineProperty(SIMD, 'WWWZ', { get: function() { return 0xBF; } });
 Object.defineProperty(SIMD, 'WWWW', { get: function() { return 0xFF; } });
-
diff --git a/system/include/emscripten/emmintrin.h b/system/include/emscripten/emmintrin.h
new file mode 100644
index 00000000..31265db8
--- /dev/null
+++ b/system/include/emscripten/emmintrin.h
@@ -0,0 +1,89 @@
+// Guard against double inclusion; the static inline definitions below must be seen once.
+#pragma once
+#include <xmmintrin.h>
+
+typedef int32x4 __m128i;
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_set_epi32(int z, int y, int x, int w)
+{
+  return (__m128i){ w, x, y, z };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_set1_epi32(int w)
+{
+  return (__m128i){ w, w, w, w };
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_setzero_si128(void)
+{
+  return (__m128i){ 0, 0, 0, 0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_si128(__m128i *p, __m128i a)
+{
+  *p = a;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_and_si128(__m128i a, __m128i b)
+{
+  return a & b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_andnot_si128(__m128i a, __m128i b)
+{
+  return ~a & b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_or_si128(__m128i a, __m128i b)
+{
+  return a | b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_xor_si128(__m128i a, __m128i b)
+{
+  return a ^ b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_add_epi32(__m128i a, __m128i b)
+{
+  return a + b;
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_sub_epi32(__m128i a, __m128i b)
+{
+  return a - b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_castsi128_ps(__m128i a)
+{
+  return emscripten_int32x4_bitsToFloat32x4(a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cvtepi32_ps(__m128i a)
+{
+  return emscripten_int32x4_toFloat32x4(a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_castps_si128(__m128 a)
+{
+  return emscripten_float32x4_bitsToInt32x4(a);
+}
+
+static __inline__ __m128i __attribute__((__always_inline__))
+_mm_cvtps_epi32(__m128 a)
+{
+  return emscripten_float32x4_toInt32x4(a);
+}
diff --git a/system/include/emscripten/vector.h b/system/include/emscripten/vector.h
index 938f2369..cf26a5d6 100644
--- a/system/include/emscripten/vector.h
+++ b/system/include/emscripten/vector.h
@@ -2,7 +2,7 @@
 // Support for the JS SIMD API proposal, https://github.com/johnmccutchan/ecmascript_simd
 
 typedef float float32x4 __attribute__((__vector_size__(16)));
-typedef unsigned int uint32x4 __attribute__((__vector_size__(16)));
+typedef unsigned int int32x4 __attribute__((__vector_size__(16))); // NOTE(review): lanes are unsigned despite the int32x4 name; confirm this is intended
 
 #ifdef __cplusplus
 extern "C" {
@@ -10,6 +10,24 @@ extern "C" {
 
 unsigned int emscripten_float32x4_signmask(float32x4 x);
 
+float32x4 emscripten_float32x4_min(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_max(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_sqrt(float32x4 a);
+float32x4 emscripten_float32x4_lessThan(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_lessThanOrEqual(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_equal(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_greaterThanOrEqual(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_greaterThan(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_and(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_andNot(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_or(float32x4 a, float32x4 b);
+float32x4 emscripten_float32x4_xor(float32x4 a, float32x4 b);
+
+float32x4 emscripten_int32x4_bitsToFloat32x4(int32x4 a);
+float32x4 emscripten_int32x4_toFloat32x4(int32x4 a);
+int32x4 emscripten_float32x4_bitsToInt32x4(float32x4 a);
+int32x4 emscripten_float32x4_toInt32x4(float32x4 a);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/system/include/emscripten/xmmintrin.h b/system/include/emscripten/xmmintrin.h
new file mode 100644
index 00000000..1b9108fa
--- /dev/null
+++ b/system/include/emscripten/xmmintrin.h
@@ -0,0 +1,133 @@
+// Guard against double inclusion: emmintrin.h includes this header too.
+#pragma once
+#include <vector.h>
+
+typedef float32x4 __m128;
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_set_ps(float z, float y, float x, float w)
+{
+  return (__m128){ w, x, y, z };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_set1_ps(float w)
+{
+  return (__m128){ w, w, w, w };
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_setzero_ps(void)
+{
+  return (__m128){ 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline__ void __attribute__((__always_inline__))
+_mm_store_ps(float *p, __m128 a)
+{
+  *(__m128 *)p = a;
+}
+
+static __inline__ int __attribute__((__always_inline__))
+_mm_movemask_ps(__m128 a)
+{
+  return emscripten_float32x4_signmask(a);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_add_ps(__m128 a, __m128 b)
+{
+  return a + b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_sub_ps(__m128 a, __m128 b)
+{
+  return a - b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_mul_ps(__m128 a, __m128 b)
+{
+  return a * b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_div_ps(__m128 a, __m128 b)
+{
+  return a / b;
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_min_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_min(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_max_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_max(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_sqrt_ps(__m128 a)
+{
+  return emscripten_float32x4_sqrt(a);
+}
+
+/* TODO: shuffles */
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmplt_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_lessThan(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmple_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_lessThanOrEqual(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpeq_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_equal(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpge_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_greaterThanOrEqual(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_cmpgt_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_greaterThan(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_and_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_and(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_andnot_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_andNot(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_or_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_or(a, b);
+}
+
+static __inline__ __m128 __attribute__((__always_inline__))
+_mm_xor_ps(__m128 a, __m128 b)
+{
+  return emscripten_float32x4_xor(a, b);
+}
diff --git a/tests/test_core.py b/tests/test_core.py
index cbde794f..ec00c0a5 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -8811,20 +8811,20 @@ int main(int argc, char **argv) {
     printf("zeros %d, %d, %d, %d\n", (int)c[0], (int)c[1], (int)c[2], (int)c[3]);
   }
   {
-    uint32x4 *a = (uint32x4*)&data[0];
-    uint32x4 *b = (uint32x4*)&data[4];
-    uint32x4 c, d, e, f;
+    int32x4 *a = (int32x4*)&data[0];
+    int32x4 *b = (int32x4*)&data[4];
+    int32x4 c, d, e, f;
     c = *a;
     d = *b;
-    printf("4uints! %d, %d, %d, %d   %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], d[0], d[1], d[2], d[3]);
+    printf("4ints! %d, %d, %d, %d   %d, %d, %d, %d\n", c[0], c[1], c[2], c[3], d[0], d[1], d[2], d[3]);
     e = c+d;
     f = c-d;
-    printf("5uints! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
+    printf("5ints! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
     e = c&d;
     f = c|d;
     e = ~c&d;
     f = c^d;
-    printf("5uintops! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
+    printf("5intops! %d, %d, %d, %d   %d, %d, %d, %d\n", e[0], e[1], e[2], e[3], f[0], f[1], f[2], f[3]);
   }
   {
     float32x4 c, d, e, f;
@@ -8842,9 +8842,9 @@ int main(int argc, char **argv) {
 2floats! 48, 68, 92, 120   42, 56, 72, 90
 3floats! 48, 68, 92, 120   2016, 3808, 6624, 10800
 zeros 0, 0, 0, 0
-4uints! 1086324736, 1094713344, 1101004800, 1106247680   1109917696, 1113587712, 1116733440, 1119092736
-5uints! -2098724864, -2086666240, -2077229056, -2069626880   -23592960, -18874368, -15728640, -12845056
-5uintops! 36175872, 35651584, 34603008, 33816576   48758784, 52428800, 53477376, 54788096
+4ints! 1086324736, 1094713344, 1101004800, 1106247680   1109917696, 1113587712, 1116733440, 1119092736
+5ints! -2098724864, -2086666240, -2077229056, -2069626880   -23592960, -18874368, -15728640, -12845056
+5intops! 36175872, 35651584, 34603008, 33816576   48758784, 52428800, 53477376, 54788096
 6floats! -9, 0, 4, 9   -2, -12, 14, 10
 ''')
 
@@ -8895,6 +8895,484 @@ zeros 0, 0, 0, 0
 16.000000
 ''')
 
+  def test_simd3(self):
+    if Settings.USE_TYPED_ARRAYS != 2: return self.skip('needs ta2')
+    if Settings.ASM_JS: Settings.ASM_JS = 2 # does not validate
+    src = r'''
+    #include <iostream>
+    #include <emmintrin.h>
+    #include <assert.h>
+    #include <stdint.h>
+    #include <bitset>
+
+    using namespace std;
+
+    void testSetPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 4.0);
+        assert(ar[1] == 3.0);
+        assert(ar[2] == 2.0);
+        assert(ar[3] == 1.0);
+    }
+
+    void testSet1Ps() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_set1_ps(5.5);
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 5.5);
+        assert(ar[1] == 5.5);
+        assert(ar[2] == 5.5);
+        assert(ar[3] == 5.5);
+    }
+
+    void testSetZeroPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v = _mm_setzero_ps();
+        _mm_store_ps(ar, v);    
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testSetEpi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_set_epi32(5, 7, 126, 381);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 381);
+        assert(ar[1] == 126);
+        assert(ar[2] == 7);
+        assert(ar[3] == 5);
+        v = _mm_set_epi32(0x55555555, 0xaaaaaaaa, 0xffffffff, 0x12345678);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x12345678);
+        assert(ar[1] == 0xffffffff);
+        assert(ar[2] == 0xaaaaaaaa);
+        assert(ar[3] == 0x55555555);
+    }
+
+    void testSet1Epi32() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_set1_epi32(-5);
+        _mm_store_si128((__m128i *)ar, v);    
+        assert(ar[0] == -5);
+        assert(ar[1] == -5);
+        assert(ar[2] == -5);
+        assert(ar[3] == -5);
+    }
+
+    void testSetZeroSi128() {
+        int32_t __attribute__((__aligned__(16))) ar[4];
+        __m128i v = _mm_setzero_si128();
+        _mm_store_si128((__m128i *)ar, v);    
+        assert(ar[0] == 0);
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testBitCasts() {
+        int32_t __attribute__((__aligned__(16))) ar1[4];
+        float __attribute__((__aligned__(16))) ar2[4];
+        __m128i v1 = _mm_set_epi32(0x3f800000, 0x40000000, 0x40400000, 0x40800000);
+        __m128 v2 = _mm_castsi128_ps(v1);
+        _mm_store_ps(ar2, v2);
+        assert(ar2[0] == 4.0);
+        assert(ar2[1] == 3.0);
+        assert(ar2[2] == 2.0);
+        assert(ar2[3] == 1.0);
+        v2 = _mm_set_ps(5.0, 6.0, 7.0, 8.0);
+        v1 = _mm_castps_si128(v2);
+        _mm_store_si128((__m128i *)ar1, v1);
+        assert(ar1[0] == 0x41000000);
+        assert(ar1[1] == 0x40e00000);
+        assert(ar1[2] == 0x40c00000);
+        assert(ar1[3] == 0x40a00000);
+        float w = 0;
+        float z = -278.3;
+        float y = 5.2;
+        float x = -987654321; 
+        v1 = _mm_castps_si128(_mm_set_ps(w, z, y, x));
+        _mm_store_ps(ar2, _mm_castsi128_ps(v1));
+        assert(ar2[0] == x);
+        assert(ar2[1] == y);
+        assert(ar2[2] == z);
+        assert(ar2[3] == w);
+        /*
+        std::bitset<sizeof(float)*CHAR_BIT> bits1x(*reinterpret_cast<unsigned long*>(&(ar2[0])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1y(*reinterpret_cast<unsigned long*>(&(ar2[1])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1z(*reinterpret_cast<unsigned long*>(&(ar2[2])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits1w(*reinterpret_cast<unsigned long*>(&(ar2[3])));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2x(*reinterpret_cast<unsigned long*>(&x));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2y(*reinterpret_cast<unsigned long*>(&y));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2z(*reinterpret_cast<unsigned long*>(&z));
+        std::bitset<sizeof(float)*CHAR_BIT> bits2w(*reinterpret_cast<unsigned long*>(&w));
+        assert(bits1x == bits2x);
+        assert(bits1y == bits2y);
+        assert(bits1z == bits2z);
+        assert(bits1w == bits2w);
+        */
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0, 0x5555cccc, 0xaaaaaaaa));
+        _mm_store_si128((__m128i *)ar1, _mm_castps_si128(v2));
+        assert(ar1[0] == 0xaaaaaaaa);
+        assert(ar1[1] == 0x5555cccc);
+        assert(ar1[2] == 0);
+        assert(ar1[3] == 0xffffffff);
+    }
+
+    void testConversions() {
+        int32_t __attribute__((__aligned__(16))) ar1[4];
+        float __attribute__((__aligned__(16))) ar2[4];
+        __m128i v1 = _mm_set_epi32(0, -3, -517, 256);
+        __m128 v2 = _mm_cvtepi32_ps(v1);
+        _mm_store_ps(ar2, v2);
+        assert(ar2[0] == 256.0);
+        assert(ar2[1] == -517.0);
+        assert(ar2[2] == -3.0);
+        assert(ar2[3] == 0);
+        v2 = _mm_set_ps(5.0, 6.0, 7.45, -8.0);
+        v1 = _mm_cvtps_epi32(v2);
+        _mm_store_si128((__m128i *)ar1, v1);
+        assert(ar1[0] == -8);
+        assert(ar1[1] == 7);
+        assert(ar1[2] == 6);
+        assert(ar1[3] == 5);
+    }
+
+    void testMoveMaskPs() {
+        __m128 v = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff));
+        int mask = _mm_movemask_ps(v);
+        assert(mask == 13);
+    }
+
+    void testAddPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_add_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 41.0);
+        assert(ar[1] == 32.0);
+        assert(ar[2] == 23.0);
+        assert(ar[3] == 14.0);
+    }
+
+    void testSubPs() {
+        float __attribute__((__aligned__(16))) ar[4];
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_sub_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == -39.0);
+        assert(ar[1] == -28.0);
+        assert(ar[2] == -17.0);
+        assert(ar[3] == -6.0);
+    }
+
+    void testMulPs() { // Checks lane-wise float multiplication with _mm_mul_ps.
+        float __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_ps
+        __m128 v1 = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(10.0, 20.0, 30.0, 40.0);
+        __m128 v = _mm_mul_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 40.0); // 1.0 * 40.0
+        assert(ar[1] == 60.0); // 2.0 * 30.0
+        assert(ar[2] == 60.0); // 3.0 * 20.0
+        assert(ar[3] == 40.0); // 4.0 * 10.0
+    }
+
+    void testDivPs() { // Checks lane-wise float division (v1 / v2) with _mm_div_ps.
+        float __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_ps
+        __m128 v1 = _mm_set_ps(4.0, 9.0, 8.0, 1.0); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 3.0, 1.0, 0.5);
+        __m128 v = _mm_div_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 2.0); // 1.0 / 0.5
+        assert(ar[1] == 8.0); // 8.0 / 1.0
+        assert(ar[2] == 3.0); // 9.0 / 3.0
+        assert(ar[3] == 2.0); // 4.0 / 2.0
+    }
+
+    void testMinPs() { // Checks the lane-wise minimum produced by _mm_min_ps.
+        float __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_ps
+        __m128 v1 = _mm_set_ps(-20.0, 10.0, 30.0, 0.5); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 1.0, 50.0, 0.0);
+        __m128 v = _mm_min_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 0.0); // min(0.5, 0.0)
+        assert(ar[1] == 30.0); // min(30.0, 50.0)
+        assert(ar[2] == 1.0); // min(10.0, 1.0)
+        assert(ar[3] == -20.0); // min(-20.0, 2.0)
+    }
+
+    void testMaxPs() { // Checks the lane-wise maximum produced by _mm_max_ps.
+        float __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_ps
+        __m128 v1 = _mm_set_ps(-20.0, 10.0, 30.0, 0.5); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.5, 5.0, 55.0, 1.0);
+        __m128 v = _mm_max_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 1.0); // max(0.5, 1.0)
+        assert(ar[1] == 55.0); // max(30.0, 55.0)
+        assert(ar[2] == 10.0); // max(10.0, 5.0)
+        assert(ar[3] == 2.5); // max(-20.0, 2.5)
+    }
+
+    void testSqrtPs() { // Checks lane-wise square root with _mm_sqrt_ps on perfect squares.
+        float __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_ps
+        __m128 v1 = _mm_set_ps(16.0, 9.0, 4.0, 1.0); // perfect squares, so the exact == comparisons below are meaningful
+        __m128 v = _mm_sqrt_ps(v1);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 1.0);
+        assert(ar[1] == 2.0);
+        assert(ar[2] == 3.0);
+        assert(ar[3] == 4.0);
+    }
+
+    void testCmpLtPs() { // Checks _mm_cmplt_ps: expects an all-ones lane where v1 < v2 and a zero lane elsewhere.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmplt_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0xffffffff); // 0.001 < 0.1; int32_t -1 equals the unsigned literal after integer conversion (assuming 32-bit int)
+        assert(ar[1] == 0); // 0.1 < 0.001 is false
+        assert(ar[2] == 0); // 2.0 < 2.0 is false
+        assert(ar[3] == 0xffffffff); // 1.0 < 2.0
+        assert(_mm_movemask_ps(v) == 9); // 9 == 0b1001: bits 0 and 3 correspond to the all-ones lanes
+    }
+
+    void testCmpLePs() { // Checks _mm_cmple_ps: expects an all-ones lane where v1 <= v2 and a zero lane elsewhere.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmple_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0xffffffff); // 0.001 <= 0.1
+        assert(ar[1] == 0); // 0.1 <= 0.001 is false
+        assert(ar[2] == 0xffffffff); // 2.0 <= 2.0
+        assert(ar[3] == 0xffffffff); // 1.0 <= 2.0
+        assert(_mm_movemask_ps(v) == 13); // 13 == 0b1101: bits 0, 2 and 3 correspond to the all-ones lanes
+    }
+
+    void testCmpEqPs() { // Checks _mm_cmpeq_ps: expects an all-ones lane where v1 == v2 and a zero lane elsewhere.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpeq_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0); // 0.001 vs 0.1
+        assert(ar[1] == 0); // 0.1 vs 0.001
+        assert(ar[2] == 0xffffffff); // 2.0 == 2.0 is the only equal pair
+        assert(ar[3] == 0); // 1.0 vs 2.0
+        assert(_mm_movemask_ps(v) == 4); // 4 == 0b0100: only bit 2 set
+    }
+
+    void testCmpGePs() { // Checks _mm_cmpge_ps: expects an all-ones lane where v1 >= v2 and a zero lane elsewhere.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpge_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0); // 0.001 >= 0.1 is false
+        assert(ar[1] == 0xffffffff); // 0.1 >= 0.001
+        assert(ar[2] == 0xffffffff); // 2.0 >= 2.0
+        assert(ar[3] == 0); // 1.0 >= 2.0 is false
+        assert(_mm_movemask_ps(v) == 6); // 6 == 0b0110: bits 1 and 2 set
+    }
+
+    void testCmpGtPs() { // Checks _mm_cmpgt_ps: expects an all-ones lane where v1 > v2 and a zero lane elsewhere.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_set_ps(1.0, 2.0, 0.1, 0.001); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_set_ps(2.0, 2.0, 0.001, 0.1);
+        __m128 v = _mm_cmpgt_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0); // 0.001 > 0.1 is false
+        assert(ar[1] == 0xffffffff); // 0.1 > 0.001 is the only true pair
+        assert(ar[2] == 0); // 2.0 > 2.0 is false
+        assert(ar[3] == 0); // 1.0 > 2.0 is false
+        assert(_mm_movemask_ps(v) == 2); // 2 == 0b0010: only bit 1 set
+    }
+
+    void testAndPs() { // Checks _mm_and_ps twice: masking float lanes, then ANDing complementary bit patterns.
+        float __attribute__((__aligned__(16))) ar[4]; // first half reads the result back as floats
+        __m128 v1 = _mm_set_ps(425, -501, -32, 68); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff)); // all-ones keeps a lane, zero clears it
+        __m128 v = _mm_and_ps(v1, v2);
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 68);
+        assert(ar[1] == 0); // the -32 lane is paired with the zero mask lane
+        assert(ar[2] == -501);
+        assert(ar[3] == 425);
+        int32_t __attribute__((__aligned__(16))) ar2[4]; // second half reads the result back as raw 32-bit lanes
+        v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa)); // -1431655766 is 0xaaaaaaaa written as a signed 32-bit value
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555));
+        v = _mm_and_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar2, _mm_castps_si128(v));
+        assert(ar2[0] == 0); // 0xaaaaaaaa & 0x55555555 share no set bits
+        assert(ar2[1] == 0);
+        assert(ar2[2] == 0);
+        assert(ar2[3] == 0);
+    }
+
+    void testAndNotPs() { // Checks _mm_andnot_ps; the asserts expect (NOT first argument) AND second argument.
+        float __attribute__((__aligned__(16))) ar[4]; // first half reads the result back as floats
+        __m128 v1 = _mm_set_ps(425, -501, -32, 68); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0xffffffff, 0xffffffff, 0, 0xffffffff));
+        __m128 v = _mm_andnot_ps(v2, v1); // note the order: the mask v2 is the first (inverted) operand
+        _mm_store_ps(ar, v);
+        assert(ar[0] == 0);
+        assert(ar[1] == -32); // only the lane whose mask is zero survives
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+        int32_t __attribute__((__aligned__(16))) ar2[4]; // second half reads the result back as raw 32-bit lanes
+        v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa)); // -1431655766 is 0xaaaaaaaa written as a signed 32-bit value
+        v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555));
+        v = _mm_andnot_ps(v1, v2); // operand order differs from the call above: here v1 is the inverted operand
+        _mm_store_si128((__m128i *)ar2, _mm_castps_si128(v));
+        assert(ar2[0] == 0x55555555); // ~0xaaaaaaaa & 0x55555555
+        assert(ar2[1] == 0x55555555);
+        assert(ar2[2] == 0x55555555);
+        assert(ar2[3] == 0x55555555);
+    }
+
+    void testOrPs() { // Checks bitwise OR of float-typed vectors with _mm_or_ps, using integer bit patterns.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0)); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555));
+        __m128 v = _mm_or_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0x55555555); // 0 | 0x55555555
+        assert(ar[1] == 0xffffffff); // 0xffffffff | 0x55555555
+        assert(ar[2] == 0xffffffff); // 0xaaaaaaaa | 0x55555555
+        assert(ar[3] == 0xffffffff); // 0xaaaaaaaa | 0x55555555
+    }
+
+    void testXorPs() { // Checks bitwise XOR of float-typed vectors with _mm_xor_ps, using integer bit patterns.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // result is inspected as raw 32-bit lanes, not as floats
+        __m128 v1 = _mm_castsi128_ps(_mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0)); // read right-to-left as elements 0..3, per the asserts below
+        __m128 v2 = _mm_castsi128_ps(_mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555));
+        __m128 v = _mm_xor_ps(v1, v2);
+        _mm_store_si128((__m128i *)ar, _mm_castps_si128(v));
+        assert(ar[0] == 0x55555555); // 0 ^ 0x55555555
+        assert(ar[1] == 0xaaaaaaaa); // 0xffffffff ^ 0x55555555
+        assert(ar[2] == 0xffffffff); // 0xaaaaaaaa ^ 0x55555555
+        assert(ar[3] == 0xffffffff); // 0xaaaaaaaa ^ 0x55555555
+    }
+
+    void testAndSi128() { // Checks bitwise AND of integer vectors with _mm_and_si128 on complementary bit patterns.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa); // -1431655766 is 0xaaaaaaaa written as a signed 32-bit value
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555);
+        __m128i v = _mm_and_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0); // 0xaaaaaaaa & 0x55555555 share no set bits
+        assert(ar[1] == 0);
+        assert(ar[2] == 0);
+        assert(ar[3] == 0);
+    }
+
+    void testAndNotSi128() { // Checks _mm_andnot_si128; the asserts expect (NOT v1) AND v2.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, -1431655766, 0xaaaaaaaa); // -1431655766 is 0xaaaaaaaa written as a signed 32-bit value
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555,  0x55555555, 0x55555555);
+        __m128i v = _mm_andnot_si128(v1, v2); // first operand is the one that gets inverted
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555); // ~0xaaaaaaaa & 0x55555555
+        assert(ar[1] == 0x55555555);
+        assert(ar[2] == 0x55555555);
+        assert(ar[3] == 0x55555555);
+    }
+
+    void testOrSi128() { // Checks bitwise OR of integer vectors with _mm_or_si128.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0); // read right-to-left as elements 0..3, per the asserts below
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555);
+        __m128i v = _mm_or_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555); // 0 | 0x55555555
+        assert(ar[1] == 0xffffffff); // 0xffffffff | 0x55555555
+        assert(ar[2] == 0xffffffff); // 0xaaaaaaaa | 0x55555555
+        assert(ar[3] == 0xffffffff); // 0xaaaaaaaa | 0x55555555
+    }
+
+    void testXorSi128() { // Checks bitwise XOR of integer vectors with _mm_xor_si128.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(0xaaaaaaaa, 0xaaaaaaaa, 0xffffffff, 0); // read right-to-left as elements 0..3, per the asserts below
+        __m128i v2 = _mm_set_epi32(0x55555555, 0x55555555, 0x55555555, 0x55555555);
+        __m128i v = _mm_xor_si128(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 0x55555555); // 0 ^ 0x55555555
+        assert(ar[1] == 0xaaaaaaaa); // 0xffffffff ^ 0x55555555
+        assert(ar[2] == 0xffffffff); // 0xaaaaaaaa ^ 0x55555555
+        assert(ar[3] == 0xffffffff); // 0xaaaaaaaa ^ 0x55555555
+    }
+
+    void testAddEpi32() { // Checks lane-wise 32-bit integer addition with _mm_add_epi32.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(4, 3, 2, 1); // read right-to-left as elements 0..3, per the asserts below
+        __m128i v2 = _mm_set_epi32(10, 20, 30, 40);
+        __m128i v = _mm_add_epi32(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == 41); // 1 + 40
+        assert(ar[1] == 32); // 2 + 30
+        assert(ar[2] == 23); // 3 + 20
+        assert(ar[3] == 14); // 4 + 10
+    }
+
+    void testSubEpi32() { // Checks lane-wise 32-bit integer subtraction (v1 - v2) with _mm_sub_epi32.
+        int32_t __attribute__((__aligned__(16))) ar[4]; // 16-byte aligned target for _mm_store_si128
+        __m128i v1 = _mm_set_epi32(4, 3, 2, 1); // read right-to-left as elements 0..3, per the asserts below
+        __m128i v2 = _mm_set_epi32(10, 20, 30, 40);
+        __m128i v = _mm_sub_epi32(v1, v2);
+        _mm_store_si128((__m128i *)ar, v);
+        assert(ar[0] == -39); // 1 - 40
+        assert(ar[1] == -28); // 2 - 30
+        assert(ar[2] == -17); // 3 - 20
+        assert(ar[3] == -6); // 4 - 10
+    }
+
+    int main(int argc, char ** argv) { // Runs every test function in turn; argc and argv are unused.
+        testSetPs(); // construction tests
+        testSet1Ps();
+        testSetZeroPs();
+        testSetEpi32();
+        testSet1Epi32();
+        testSetZeroSi128();
+        testBitCasts(); // casts, conversions and sign mask
+        testConversions();
+        testMoveMaskPs();
+        testAddPs(); // float arithmetic
+        testSubPs();
+        testMulPs();
+        testDivPs();
+        testMaxPs();
+        testMinPs();
+        testSqrtPs();
+        testCmpLtPs(); // float comparisons
+        testCmpLePs();
+        testCmpEqPs();
+        testCmpGePs();
+        testCmpGtPs();
+        testAndPs(); // bitwise operations on float-typed vectors
+        testAndNotPs();
+        testOrPs();
+        testXorPs();
+        testAndSi128(); // bitwise operations on integer vectors
+        testAndNotSi128();
+        testOrSi128();
+        testXorSi128();
+        testAddEpi32(); // integer arithmetic
+        testSubEpi32();
+        printf("DONE"); // reached only if no assert fired; presumably the marker the surrounding Python test looks for
+        return 0;
+    }
+    '''
+
+    self.do_run(src, 'DONE')
+
+
   def test_gcc_unmangler(self):
     Settings.NAMED_GLOBALS = 1 # test coverage for this