5 files changed, 42 insertions, 2 deletions
diff --git a/emcc b/emcc
index 07cf94d1..bb1a2496 100755
--- a/emcc
+++ b/emcc
@@ -140,7 +140,12 @@ Options that are modified or new in %s include:
                            optimizations, and no runtime assertions
                            or C++ exception catching (to re-enable
                            C++ exception catching, use
-                           -s DISABLE_EXCEPTION_CATCHING=0 ).
+                           -s DISABLE_EXCEPTION_CATCHING=0 ). 32-bit
+                           multiplication is done in JS doubles which
+                           is fast but imprecise for high values.
+                           (For details on the effects of different
+                           opt levels, see apply_opt_level() in
+                           tools/shared.py)
                            Note: Optimizations are only done when
                            compiling to JavaScript, not to intermediate
                            bitcode.
diff --git a/src/parseTools.js b/src/parseTools.js
index 86e3c643..e37f3a99 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -1794,7 +1794,14 @@ function processMathop(item) {
     case 'add': return handleOverflow(getFastValue(idents[0], '+', idents[1], item.type), bits);
     case 'sub': return handleOverflow(getFastValue(idents[0], '-', idents[1], item.type), bits);
     case 'sdiv': case 'udiv': return makeRounding(getFastValue(idents[0], '/', idents[1], item.type), bits, op[0] === 's');
-    case 'mul': return handleOverflow(getFastValue(idents[0], '*', idents[1], item.type), bits);
+    case 'mul': {
+      if (bits == 32 && PRECISE_I32_MUL) {
+        preciseI64MathUsed = true;
+        return '(i64Math.multiply(' + idents[0] + ',0,' + idents[1] + ',0),i64Math.result[0])';
+      } else {
+        return handleOverflow(getFastValue(idents[0], '*', idents[1], item.type), bits);
+      }
+    }
     case 'urem': case 'srem': return getFastValue(idents[0], '%', idents[1], item.type);
     case 'or': {
       if (bits > 32) {
diff --git a/src/settings.js b/src/settings.js
index 75b30003..110cc246 100644
--- a/src/settings.js
+++ b/src/settings.js
@@ -76,6 +76,16 @@ var DOUBLE_MODE = 1; // How to load and store 64-bit doubles. Without typed arra
                      // NaN or an infinite number.
 var PRECISE_I64_MATH = 1; // If enabled, i64 addition etc. is emulated - which is slow but precise. If disabled,
                           // we use the 'double trick' which is fast but incurs rounding at high values.
+                          // Note that this does not cover 32-bit multiplication (which must be done in 64 bits
+                          // for high values for full precision) - that is controlled separately by
+                          // PRECISE_I32_MUL, which is turned off at opt level 1 and above.
+var PRECISE_I32_MUL = 1; // If enabled, i32 multiplication is done using i64 math. This is necessary if the
+                         // product exceeds the range JS doubles represent exactly (integers up to 2^53). This
+                         // option can normally be disabled because generally i32 multiplication works ok
+                         // without it, and enabling it has a big impact on performance.
+                         // Note that you can hand-optimize your code to avoid the need for this: If you do
+                         // multiplications that actually need 64-bit precision inside 64-bit values, things
+                         // will work properly. (Unless the LLVM optimizer turns them into 32-bit values?)
 
 var CLOSURE_ANNOTATIONS = 0; // If set, the generated code will be annotated for the closure
                              // compiler. This potentially lets closure optimize the code better.
diff --git a/tests/runner.py b/tests/runner.py
index 876990d2..32776167 100755
--- a/tests/runner.py
+++ b/tests/runner.py
@@ -953,6 +953,23 @@ m_divisor is 1091269979
       '''
       self.do_run(src, 'zero 2, 104', ['hallo'])
 
+    def test_i32_mul_precise(self):
+      if self.emcc_args == None: return self.skip('needs ta2')
+
+      self.emcc_args += ['-s', 'PRECISE_I32_MUL=1']
+      src = r'''
+        #include <stdio.h>
+
+        int main(int argc, char **argv) {
+          unsigned long d1 = 0x847c9b5d;
+          unsigned long q =  0x549530e1;
+          if (argc > 1000) { q += argc; d1 -= argc; } // confuse optimizer
+          printf("%lu\n", d1*q);
+          return 0;
+        }
+      '''
+      self.do_run(src, '3217489085')
+
     def test_i16_emcc_intrinsic(self):
       Settings.CORRECT_SIGNS = 1 # Relevant to this test
 
diff --git a/tools/shared.py b/tools/shared.py
index 672a1a18..57541077 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -413,6 +413,7 @@ class Settings:
         if opt_level >= 1:
           Settings.ASSERTIONS = 0
           Settings.DISABLE_EXCEPTION_CATCHING = 1
+          Settings.PRECISE_I32_MUL = 0
         if opt_level >= 2:
           Settings.RELOOP = 1
         if opt_level >= 3: