10 files changed, 296 insertions, 120 deletions
diff --git a/emcc b/emcc
index 90a2ea71..bffe8d5d 100755
--- a/emcc
+++ b/emcc
@@ -522,14 +522,14 @@ if CONFIGURE_CONFIG or CMAKE_CONFIG:
       open(tempout, 'w').write('//\n')
 
   src = None
-  for i in range(len(sys.argv)):
-    if sys.argv[i].endswith('.c'):
+  for arg in sys.argv:
+    if arg.endswith('.c'):
       try:
-        src = open(sys.argv[i]).read()
-        if debug_configure: open(tempout, 'a').write('============= ' + sys.argv[i] + '\n' + src + '\n=============\n\n')
+        src = open(arg).read()
+        if debug_configure: open(tempout, 'a').write('============= ' + arg + '\n' + src + '\n=============\n\n')
       except:
         pass
-    if sys.argv[i].endswith('.s'):
+    if arg.endswith('.s'):
       if debug_configure: open(tempout, 'a').write('(compiling .s assembly, must use clang\n')
       use_js = 0
 
@@ -826,7 +826,7 @@ try:
       newargs[i] = ''
       newargs[i+1] = ''
     elif newargs[i].startswith('--use-preload-cache'):
-      use_preload_cache = True;
+      use_preload_cache = True
       newargs[i] = ''
     elif newargs[i] == '--ignore-dynamic-linking':
       ignore_dynamic_linking = True
@@ -952,7 +952,7 @@ try:
               if not prefix: continue
               if l.startswith(prefix):
                 l = l[len(prefix):]
-                break;
+                break
             libs.append(l)
             newargs[i] = ''
           else:
@@ -1121,9 +1121,8 @@ try:
 
     # Optimize source files
     if llvm_opts > 0:
-      for i in range(len(input_files)):
-        input_file = input_files[i]
-        if input_files[i].endswith(SOURCE_SUFFIXES):
+      for i, input_file in enumerate(input_files):
+        if input_file.endswith(SOURCE_SUFFIXES):
           temp_file = temp_files[i]
           logging.debug('optimizing %s with -O%d' % (input_file, llvm_opts))
           shared.Building.llvm_opt(temp_file, llvm_opts)
@@ -1214,7 +1213,7 @@ try:
         os.path.join('libc', 'gen', 'vwarn.c'),
         os.path.join('libc', 'gen', 'vwarnx.c'),
         os.path.join('libc', 'stdlib', 'strtod.c'),
-      ];
+      ]
       return build_libc('libc.bc', libc_files)
 
     def apply_libc(need):
diff --git a/emscripten.py b/emscripten.py
index df0587f9..d9367566 100755
--- a/emscripten.py
+++ b/emscripten.py
@@ -163,10 +163,10 @@ def emscript(infile, settings, outfile, libraries=[], compiler_engine=None,
     if DEBUG_CACHE and not out:
       dfpath = os.path.join(get_configuration().TEMP_DIR, "ems_" + shortkey)
       dfp = open(dfpath, 'w')
-      dfp.write(pre_input);
-      dfp.write("\n\n========================== settings_text\n\n");
-      dfp.write(settings_text);
-      dfp.write("\n\n========================== libraries\n\n");
+      dfp.write(pre_input)
+      dfp.write("\n\n========================== settings_text\n\n")
+      dfp.write(settings_text)
+      dfp.write("\n\n========================== libraries\n\n")
       dfp.write("\n".join(libraries))
       dfp.close()
       print >>sys.stderr, '  cache miss, key data dumped to %s' % dfpath
@@ -485,7 +485,7 @@ def emscript(infile, settings, outfile, libraries=[], compiler_engine=None,
     global_vars = map(lambda g: g['name'], filter(lambda g: settings['NAMED_GLOBALS'] or g.get('external') or g.get('unIndexable'), forwarded_json['Variables']['globals'].values()))
     global_funcs = ['_' + key for key, value in forwarded_json['Functions']['libraryFunctions'].iteritems() if value != 2]
     def math_fix(g):
-      return g if not g.startswith('Math_') else g.split('_')[1];
+      return g if not g.startswith('Math_') else g.split('_')[1]
     asm_global_funcs = ''.join(['  var ' + g.replace('.', '_') + '=global.' + g + ';\n' for g in maths]) + \
                        ''.join(['  var ' + g + '=env.' + math_fix(g) + ';\n' for g in basic_funcs + global_funcs])
     asm_global_vars = ''.join(['  var ' + g + '=env.' + g + '|0;\n' for g in basic_vars + global_vars]) + \
diff --git a/src/jsifier.js b/src/jsifier.js
index 86931f3e..82b78d0a 100644
--- a/src/jsifier.js
+++ b/src/jsifier.js
@@ -1479,8 +1479,8 @@ function JSify(data, functionsOnly, givenFunctions) {
       return inline.apply(null, args); // Warning: inlining does not prevent recalculation of the arguments. They should be simple identifiers
     }
 
-    if (ASM_JS) {
-      // remove unneeded arguments, which the asm sig can show us. this lets us alias memset with llvm.memset, we just
+    if (ASM_JS && ident.indexOf('llvm_') >= 0) {
+      // remove unneeded arguments in llvm intrinsic functions, which the asm sig can show us. this lets us alias memset with llvm.memset, we just
       // drop the final 2 args so things validate properly in asm
       var libsig = LibraryManager.library[simpleIdent + '__sig'];
       if (libsig) {
diff --git a/src/parseTools.js b/src/parseTools.js
index dfd4b7ed..eb200c65 100644
--- a/src/parseTools.js
+++ b/src/parseTools.js
@@ -2222,9 +2222,9 @@ function processMathop(item) {
     // basic integer ops
     case 'add': return handleOverflow(getFastValue(idents[0], '+', idents[1], item.type), bits);
     case 'sub': return handleOverflow(getFastValue(idents[0], '-', idents[1], item.type), bits);
-    case 'sdiv': case 'udiv': return makeRounding(getFastValue(idents[0], '/', idents[1], item.type), bits, op[0] === 's');
+    case 'sdiv': case 'udiv': return makeRounding(getFastValue(idents[0], '/', idents[1], item.type), bits, true);
     case 'mul': return getFastValue(idents[0], '*', idents[1], item.type); // overflow handling is already done in getFastValue for '*'
-    case 'urem': case 'srem': return makeRounding(getFastValue(idents[0], '%', idents[1], item.type), bits, op[0] === 's');
+    case 'urem': case 'srem': return makeRounding(getFastValue(idents[0], '%', idents[1], item.type), bits, true);
     case 'or': {
       if (bits > 32) {
         assert(bits === 64, 'Too many bits for or: ' + bits);
diff --git a/tests/cases/zeroextarg.ll b/tests/cases/zeroextarg.ll
new file mode 100644
index 00000000..25efb7ec
--- /dev/null
+++ b/tests/cases/zeroextarg.ll
@@ -0,0 +1,22 @@
+; ModuleID = 'tests/hello_world.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-pc-linux-gnu"
+
+@.str = private unnamed_addr constant [15 x i8] c"hello, world!\0A\00", align 1 ; [#uses=1 type=[15 x i8]*]
+
+define void @glSampleCoverage(float %18, i8 zeroext %invert) {
+entry:
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str, i32 0, i32 0)) ; [#uses=0 type=i32]
+  ret void
+}
+
+; [#uses=0]
+define i32 @main() {
+entry:
+  tail call void @glSampleCoverage(float 3.5, i8 zeroext 12)
+  ret i32 1
+}
+
+; [#uses=1]
+declare i32 @printf(i8*, ...)
+
diff --git a/tests/runner.py b/tests/runner.py
index 7b894110..b21eee08 100755
--- a/tests/runner.py
+++ b/tests/runner.py
@@ -10020,97 +10020,97 @@ finalizing 3 (global == 0)
 ''')
 
   # Generate tests for everything
-  def make_run(fullname, name=-1, compiler=-1, embetter=0, quantum_size=0, typed_arrays=0, emcc_args=None, env='{}'):
-    exec('''
-class %s(T):
-  run_name = '%s'
-  env = %s
+  def make_run(fullname, name=-1, compiler=-1, embetter=0, quantum_size=0,
+      typed_arrays=0, emcc_args=None, env=None):
 
-  def tearDown(self):
-    super(%s, self).tearDown()
+    if env is None: env = {}
 
-    for k, v in self.env.iteritems():
-      del os.environ[k]
+    TT = type(fullname, (T,), dict(run_name = fullname, env = env))
+
+    def tearDown(self):
+      super(TT, self).tearDown()
+
+      for k, v in self.env.iteritems():
+        del os.environ[k]
+
+    TT.tearDown = tearDown
+
+    def setUp(self):
+      super(TT, self).setUp()
+      for k, v in self.env.iteritems():
+        assert k not in os.environ, k + ' should not be in environment'
+        os.environ[k] = v
+
+      global checked_sanity
+      if not checked_sanity:
+        print '(checking sanity from test runner)' # do this after we set env stuff
+        check_sanity(force=True)
+        checked_sanity = True
+
+      Building.COMPILER_TEST_OPTS = ['-g']
+      os.chdir(self.get_dir()) # Ensure the directory exists and go there
+      Building.COMPILER = compiler
+
+      self.emcc_args = None if emcc_args is None else emcc_args[:]
+      if self.emcc_args is not None:
+        Settings.load(self.emcc_args)
+        Building.LLVM_OPTS = 0
+        if '-O2' in self.emcc_args:
+          Building.COMPILER_TEST_OPTS = [] # remove -g in -O2 tests, for more coverage
+        #Building.COMPILER_TEST_OPTS += self.emcc_args
+        for arg in self.emcc_args:
+          if arg.startswith('-O'):
+            Building.COMPILER_TEST_OPTS.append(arg) # so bitcode is optimized too, this is for cpp to ll
+          else:
+            try:
+              key, value = arg.split('=')
+              Settings[key] = value # forward  -s K=V
+            except:
+              pass
+        return
+
+      # TODO: Move much of this to an init() function in shared.py, and reuse that
+      Settings.USE_TYPED_ARRAYS = typed_arrays
+      Settings.INVOKE_RUN = 1
+      Settings.RELOOP = 0 # we only do them in the "o2" pass
+      Settings.MICRO_OPTS = embetter
+      Settings.QUANTUM_SIZE = quantum_size
+      Settings.ASSERTIONS = 1-embetter
+      Settings.SAFE_HEAP = 1-embetter
+      Settings.CHECK_OVERFLOWS = 1-embetter
+      Settings.CORRECT_OVERFLOWS = 1-embetter
+      Settings.CORRECT_SIGNS = 0
+      Settings.CORRECT_ROUNDINGS = 0
+      Settings.CORRECT_OVERFLOWS_LINES = CORRECT_SIGNS_LINES = CORRECT_ROUNDINGS_LINES = SAFE_HEAP_LINES = []
+      Settings.CHECK_SIGNS = 0 #1-embetter
+      Settings.RUNTIME_TYPE_INFO = 0
+      Settings.DISABLE_EXCEPTION_CATCHING = 0
+      Settings.INCLUDE_FULL_LIBRARY = 0
+      Settings.BUILD_AS_SHARED_LIB = 0
+      Settings.RUNTIME_LINKED_LIBS = []
+      Settings.EMULATE_UNALIGNED_ACCESSES = int(Settings.USE_TYPED_ARRAYS == 2 and Building.LLVM_OPTS == 2)
+      Settings.DOUBLE_MODE = 1 if Settings.USE_TYPED_ARRAYS and Building.LLVM_OPTS == 0 else 0
+      Settings.PRECISE_I64_MATH = 0
+      Settings.NAMED_GLOBALS = 0 if not embetter else 1
+
+    TT.setUp = setUp
 
-  def setUp(self):
-    super(%s, self).setUp()
-
-    for k, v in self.env.iteritems():
-      assert k not in os.environ, k + ' should not be in environment'
-      os.environ[k] = v
-
-    global checked_sanity
-    if not checked_sanity:
-      print '(checking sanity from test runner)' # do this after we set env stuff
-      check_sanity(force=True)
-      checked_sanity = True
-
-    Building.COMPILER_TEST_OPTS = ['-g']
-    os.chdir(self.get_dir()) # Ensure the directory exists and go there
-    Building.COMPILER = %r
-
-    self.emcc_args = %s
-    if self.emcc_args is not None:
-      Settings.load(self.emcc_args)
-      Building.LLVM_OPTS = 0
-      if '-O2' in self.emcc_args:
-        Building.COMPILER_TEST_OPTS = [] # remove -g in -O2 tests, for more coverage
-      #Building.COMPILER_TEST_OPTS += self.emcc_args
-      for arg in self.emcc_args:
-        if arg.startswith('-O'):
-          Building.COMPILER_TEST_OPTS.append(arg) # so bitcode is optimized too, this is for cpp to ll
-        else:
-          try:
-            key, value = arg.split('=')
-            Settings[key] = value # forward  -s K=V
-          except:
-            pass
-      return
-
-    embetter = %d
-    quantum_size = %d
-    # TODO: Move much of these to a init() function in shared.py, and reuse that
-    Settings.USE_TYPED_ARRAYS = %d
-    Settings.INVOKE_RUN = 1
-    Settings.RELOOP = 0 # we only do them in the "o2" pass
-    Settings.MICRO_OPTS = embetter
-    Settings.QUANTUM_SIZE = quantum_size
-    Settings.ASSERTIONS = 1-embetter
-    Settings.SAFE_HEAP = 1-embetter
-    Settings.CHECK_OVERFLOWS = 1-embetter
-    Settings.CORRECT_OVERFLOWS = 1-embetter
-    Settings.CORRECT_SIGNS = 0
-    Settings.CORRECT_ROUNDINGS = 0
-    Settings.CORRECT_OVERFLOWS_LINES = CORRECT_SIGNS_LINES = CORRECT_ROUNDINGS_LINES = SAFE_HEAP_LINES = []
-    Settings.CHECK_SIGNS = 0 #1-embetter
-    Settings.RUNTIME_TYPE_INFO = 0
-    Settings.DISABLE_EXCEPTION_CATCHING = 0
-    Settings.INCLUDE_FULL_LIBRARY = 0
-    Settings.BUILD_AS_SHARED_LIB = 0
-    Settings.RUNTIME_LINKED_LIBS = []
-    Settings.EMULATE_UNALIGNED_ACCESSES = int(Settings.USE_TYPED_ARRAYS == 2 and Building.LLVM_OPTS == 2)
-    Settings.DOUBLE_MODE = 1 if Settings.USE_TYPED_ARRAYS and Building.LLVM_OPTS == 0 else 0
-    Settings.PRECISE_I64_MATH = 0
-    Settings.NAMED_GLOBALS = 0 if not embetter else 1
-
-TT = %s
-''' % (fullname, fullname, env, fullname, fullname, compiler, str(emcc_args), embetter, quantum_size, typed_arrays, fullname))
     return TT
 
   # Make one run with the defaults
-  exec('default = make_run("default", compiler=CLANG, emcc_args=[])')
+  default = make_run("default", compiler=CLANG, emcc_args=[])
 
   # Make one run with -O1, with safe heap
-  exec('o1 = make_run("o1", compiler=CLANG, emcc_args=["-O1", "-s", "ASM_JS=0", "-s", "SAFE_HEAP=1"])')
+  o1 = make_run("o1", compiler=CLANG, emcc_args=["-O1", "-s", "ASM_JS=0", "-s", "SAFE_HEAP=1"])
 
   # Make one run with -O2, but without closure (we enable closure in specific tests, otherwise on everything it is too slow)
-  exec('o2 = make_run("o2", compiler=CLANG, emcc_args=["-O2", "-s", "ASM_JS=0", "-s", "JS_CHUNK_SIZE=1024"])')
+  o2 = make_run("o2", compiler=CLANG, emcc_args=["-O2", "-s", "ASM_JS=0", "-s", "JS_CHUNK_SIZE=1024"])
 
   # asm.js
-  exec('asm1 = make_run("asm1", compiler=CLANG, emcc_args=["-O1", "-s", "CHECK_HEAP_ALIGN=1"])')
-  exec('asm2 = make_run("asm2", compiler=CLANG, emcc_args=["-O2"])')
-  exec('asm2g = make_run("asm2g", compiler=CLANG, emcc_args=["-O2", "-g", "-s", "ASSERTIONS=1", "--memory-init-file", "1"])')
-  exec('''asm2x86 = make_run("asm2x86", compiler=CLANG, emcc_args=["-O2", "-g", "-s", "CHECK_HEAP_ALIGN=1"], env='{"EMCC_LLVM_TARGET": "i386-pc-linux-gnu"}')''')
+  asm1 = make_run("asm1", compiler=CLANG, emcc_args=["-O1", "-s", "CHECK_HEAP_ALIGN=1"])
+  asm2 = make_run("asm2", compiler=CLANG, emcc_args=["-O2"])
+  asm2g = make_run("asm2g", compiler=CLANG, emcc_args=["-O2", "-g", "-s", "ASSERTIONS=1", "--memory-init-file", "1"])
+  asm2x86 = make_run("asm2x86", compiler=CLANG, emcc_args=["-O2", "-g", "-s", "CHECK_HEAP_ALIGN=1"], env={"EMCC_LLVM_TARGET": "i386-pc-linux-gnu"})
 
   # Make custom runs with various options
   for compiler, quantum, embetter, typed_arrays in [
@@ -10120,7 +10120,7 @@ TT = %s
     fullname = 's_0_%d%s%s' % (
       embetter, '' if quantum == 4 else '_q' + str(quantum), '' if typed_arrays in [0, 1] else '_t' + str(typed_arrays)
     )
-    exec('%s = make_run(fullname, %r,%r,%d,%d,%d)' % (fullname, fullname, compiler, embetter, quantum, typed_arrays))
+    locals()[fullname] = make_run(fullname, fullname, compiler, embetter, quantum, typed_arrays)
 
   del T # T is just a shape for the specific subclasses, we don't test it itself
 
diff --git a/tools/js-optimizer.js b/tools/js-optimizer.js
index 06d82752..6987511c 100644
--- a/tools/js-optimizer.js
+++ b/tools/js-optimizer.js
@@ -136,6 +136,9 @@ var CONTROL_FLOW = set('do', 'while', 'for', 'if', 'switch');
 var NAME_OR_NUM = set('name', 'num');
 var ASSOCIATIVE_BINARIES = set('+', '*', '|', '&', '^');
 
+var BREAK_CAPTURERS = set('do', 'while', 'for', 'switch');
+var CONTINUE_CAPTURERS = LOOP;
+
 var NULL_NODE = ['name', 'null'];
 var UNDEFINED_NODE = ['unary-prefix', 'void', ['num', 0]];
 var TRUE_NODE = ['unary-prefix', '!', ['num', 0]];
@@ -1531,10 +1534,15 @@ function unVarify(vars, ret) { // transform var x=1, y=2 etc. into (x=1, y=2), i
 var ASM_INT = 0;
 var ASM_DOUBLE = 1;
 
-function detectAsmCoercion(node) {
+function detectAsmCoercion(node, asmInfo) {
   // for params, +x vs x|0, for vars, 0.0 vs 0
   if (node[0] === 'num' && node[1].toString().indexOf('.') >= 0) return ASM_DOUBLE;
-  return node[0] === 'unary-prefix' ? ASM_DOUBLE : ASM_INT;
+  if (node[0] === 'unary-prefix') return ASM_DOUBLE;
+  if (asmInfo && node[0] == 'name') {
+    if (node[1] in asmInfo.vars) return asmInfo.vars[node[1]];
+    if (node[1] in asmInfo.params) return asmInfo.params[node[1]];
+  }
+  return ASM_INT;
 }
 
 function makeAsmParamCoercion(param, type) {
@@ -2974,17 +2982,31 @@ function outline(ast) {
     for (var i = 0; i < stack.length; i++) {
       asmData.stackPos[stack[i]] = i*8;
     }
-
+    // Reserve an extra two spots: one for control flow var, the other for control flow data
+    asmData.stackSize = (stack.length + 2)*8;
+    asmData.controlStackPos = asmData.stackSize - 16;
+    asmData.controlDataStackPos = asmData.stackSize - 8;
     asmData.splitCounter = 0;
   }
 
   // Analyze uses - reads and writes - of variables in part of the AST of a function
   function analyzeCode(func, asmData, ast) {
+    var labels = {};
+    var labelCounter = 1; // 0 means no label
+
+    traverse(ast, function(node, type) {
+      if ((type == 'label' || type in LOOP_FLOW) && node[1] && !(node[1] in labels)) {
+        labels[node[1]] = labelCounter++;
+      }
+    });
+
     var writes = {};
     var appearances = {};
     var hasReturn = false, hasBreak = false, hasContinue = false;
     var breaks = {};    // set of labels we break or continue
     var continues = {}; // to. '0' is an unlabeled one
+    var breakCapturers = 0;
+    var continueCapturers = 0;
 
     traverse(ast, function(node, type) {
       if (type == 'assign' && node[2][0] == 'name') {
@@ -3001,11 +3023,31 @@ function outline(ast) {
       } else if (type == 'return') {
         hasReturn = true;
       } else if (type == 'break') {
-        breaks[node[1] || 0] = 0;
+        var label = node[1] || 0;
+        if (!label && breakCapturers > 0) return; // no label, and captured
+        if (label && (label in labels)) return; // label, and defined in this code, so captured
+        breaks[label || 0] = 0;
         hasBreak = true;
       } else if (type == 'continue') {
-        continues[node[1] || 0] = 0;
+        var label = node[1] || 0;
+        if (!label && continueCapturers > 0) return; // no label, and captured
+        if (label && (label in labels)) return; // label, and defined in this code, so captured
+        continues[label || 0] = 0;
         hasContinue = true;
+      } else {
+        if (type in BREAK_CAPTURERS) {
+          breakCapturers++;
+        }
+        if (type in CONTINUE_CAPTURERS) {
+          continueCapturers++;
+        }
+      }
+    }, function(node, type) {
+      if (type in BREAK_CAPTURERS) {
+        breakCapturers--;
+      }
+      if (type in CONTINUE_CAPTURERS) {
+        continueCapturers--;
       }
     });
 
@@ -3015,9 +3057,26 @@ function outline(ast) {
       if (appearances[name] > 0) reads[name] = 0;
     }
 
-    return { writes: writes, reads: reads, hasReturn: hasReturn, breaks: breaks, continues: continues };
+    return { writes: writes, reads: reads, hasReturn: hasReturn, breaks: breaks, continues: continues, labels: labels };
+  }
+
+  function makeAssign(dst, src) {
+    return ['assign', true, dst, src];
+  }
+  function makeStackAccess(type, pos) { // TODO: float64, not 32
+    return ['sub', ['name', type == ASM_INT ? 'HEAP32' : 'HEAPF32'], ['binary', '>>', ['binary', '+', ['name', 'sp'], ['num', pos]], ['num', '2']]];
+  }
+  function makeIf(cond, then, else_) {
+    var ret = ['if', cond, ['block', then]];
+    if (else_) ret.push(['block', else_]);
+    return ret;
+  }
+  function makeComparison(left, comp, right) {
+    return ['binary', comp, left, right];
   }
 
+  var CONTROL_BREAK = 1, CONTROL_BREAK_LABEL = 2, CONTROL_CONTINUE = 3, CONTROL_CONTINUE_LABEL = 4, CONTROL_RETURN_VOID = 5, CONTROL_RETURN_INT = 6, CONTROL_RETURN_DOUBLE = 7;
+
   var sizeToOutline = extraInfo.sizeToOutline;
   var level = 0;
 
@@ -3025,7 +3084,7 @@ function outline(ast) {
     printErr(' do outline ' + [func[1], level, 'range:', start, end, 'of', stats.length]);
     var code = stats.slice(start, end+1);
     var newIdent = func[1] + '$' + (asmData.splitCounter++);
-    // add spills and reads before and after the call to the outlined code
+    // add spills and reads before and after the call to the outlined code, and in the outlined code itself
     var codeInfo = analyzeCode(func, asmData, code);
     var reps = [];
     for (var v in codeInfo.reads) {
@@ -3039,21 +3098,109 @@ function outline(ast) {
       reps.push(['stat', ['assign', true, ['name', v], ['sub', ['name', getAsmType(asmData, v) == ASM_INT ? 'HEAP32' : 'HEAPF32'], ['binary', '>>', ['binary', '+', ['name', 'sp'], ['num', asmData.stackPos[v]]], ['num', '2']]]]]);
       code.push(['stat', ['assign', true, ['sub', ['name', getAsmType(asmData, v) == ASM_INT ? 'HEAP32' : 'HEAPF32'], ['binary', '>>', ['binary', '+', ['name', 'sp'], ['num', asmData.stackPos[v]]], ['num', '2']]], ['name', v]]]);
     }
-    stats.splice.apply(stats, [start, end-start+1].concat(reps));
     // Generate new function
     if (codeInfo.hasReturn || codeInfo.hasBreak || codeInfo.hasContinue) {
       // we need to capture all control flow using a top-level labeled one-time loop in the outlined function
       code = [['label', 'OL', ['do', ['num', 0], ['block', code]]]];
+      var breakCapturers = 0;
+      var continueCapturers = 0;
+      traverse(code, function(node, type) {
+        // replace all break/continue/returns with code to break out of the main one-time loop, and set the control data
+        if (type == 'return') {
+          var ret = ['break', 'OL'];
+          if (!node[1]) {
+            ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlStackPos), ['num', CONTROL_RETURN_VOID]), ret];
+          } else {
+            var type = detectAsmCoercion(node[1], asmData);
+            ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlStackPos), ['num', type == ASM_INT ? CONTROL_RETURN_INT : CONTROL_RETURN_DOUBLE]), ret];
+            ret = ['seq', makeAssign(makeStackAccess(type, asmData.controlDataStackPos), node[1]), ret];
+          }
+          return ret;
+        } else if (type == 'break') {
+          var label = node[1] || 0;
+          if (label == 'OL') return; // this was just added before us, it is new replacement code
+          if (!label && breakCapturers > 0) return; // no label, and captured
+          if (label && (label in codeInfo.labels)) return; // label, and defined in this code, so captured
+          var ret = ['break', 'OL'];
+          ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlStackPos), ['num', label ? CONTROL_BREAK_LABEL : CONTROL_BREAK]), ret];
+          if (label) {
+            assert(label in codeInfo.labels, label + ' in ' + keys(codeInfo.labels));
+            ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlDataStackPos), ['num', codeInfo.labels[label]]), ret];
+          }
+          return ret;
+        } else if (type == 'continue') {
+          var label = node[1] || 0;
+          if (!label && continueCapturers > 0) return; // no label, and captured
+          if (label && (label in codeInfo.labels)) return; // label, and defined in this code, so captured
+          var ret = ['break', 'OL'];
+          ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlStackPos), ['num', label ? CONTROL_CONTINUE_LABEL : CONTROL_CONTINUE]), ret];
+          if (label) {
+            ret = ['seq', makeAssign(makeStackAccess(ASM_INT, asmData.controlDataStackPos), ['num', codeInfo.labels[label]]), ret];
+          }
+          return ret;
+        } else {
+          if (type in BREAK_CAPTURERS) {
+            breakCapturers++;
+          }
+          if (type in CONTINUE_CAPTURERS) {
+            continueCapturers++;
+          }
+        }
+      }, function(node, type) {
+        if (type in BREAK_CAPTURERS) {
+          breakCapturers--;
+        }
+        if (type in CONTINUE_CAPTURERS) {
+          continueCapturers--;
+        }
+      });
+      // read the control data at the callsite to the outlined function
+      if (codeInfo.hasReturn) {
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_RETURN_VOID]),
+          [['stat', ['return']]]
+        ));
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_RETURN_INT]),
+          [['stat', ['return', makeStackAccess(ASM_INT, asmData.controlDataStackPos)]]]
+        ));
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_RETURN_DOUBLE]),
+          [['stat', ['return', makeStackAccess(ASM_DOUBLE, asmData.controlDataStackPos)]]]
+        ));
+      }
+      if (codeInfo.hasBreak) {
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_BREAK]),
+          ['stat', ['break']]
+        ));
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_BREAK_LABEL]),
+          ['stat', ['break', makeStackAccess(ASM_INT, asmData.controlDataStackPos)]] // XXX here and below, need a switch overall possible labels
+        ));
+      }
+      if (codeInfo.hasContinue) {
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_CONTINUE]),
+          ['stat', ['break']]
+        ));
+        reps.push(makeIf(
+          makeComparison(makeStackAccess(ASM_INT, asmData.controlStackPos), '==', ['num', CONTROL_CONTINUE_LABEL]),
+          ['stat', ['continue', makeStackAccess(ASM_INT, asmData.controlDataStackPos)]] // XXX
+        ));
+      }
     }
     var newFunc = ['defun', newIdent, ['sp'], code];
-    var newAsmInfo = { params: { sp: ASM_INT }, vars: {} };
+    var newAsmData = { params: { sp: ASM_INT }, vars: {} };
     for (var v in codeInfo.reads) {
-      newAsmInfo.vars[v] = getAsmType(asmData, v);
+      newAsmData.vars[v] = getAsmType(asmData, v);
     }
     for (var v in codeInfo.writes) {
-      newAsmInfo.vars[v] = getAsmType(asmData, v);
+      newAsmData.vars[v] = getAsmType(asmData, v);
     }
-    denormalizeAsm(newFunc, newAsmInfo);
+    denormalizeAsm(newFunc, newAsmData);
+    // replace in stats
+    stats.splice.apply(stats, [start, end-start+1].concat(reps));
     return [newFunc];
   }
 
@@ -3138,7 +3285,7 @@ function outline(ast) {
 
     if (newFuncs.length > 0) {
       // We have outlined. Add stack support: header in which we allocate enough stack space TODO
-      // If sp was not present before, add it and before each return, pop the stack TODO
+      // If sp was not present before, add it, and pop the stack before each return. Also add a final pop if not ending with a return TODO
       // (none of this should be done in inner functions, of course, just the original)
 
       // add new functions to the toplevel, or create a toplevel if there isn't one
@@ -3175,12 +3322,13 @@ function fixDotZero(js) {
   });
 }
 
-function asmLoopOptimizer(ast) {
+function asmLastOpts(ast) {
   traverseGeneratedFunctions(ast, function(fun) {
-    // This is at the end of the pipeline, we can assume all other optimizations are done, and we modify loops
-    // into shapes that might confuse other passes
     traverse(fun, function(node, type) {
       if (type === 'while' && node[1][0] === 'num' && node[1][1] === 1 && node[2][0] === 'block') {
+        // This is at the end of the pipeline, we can assume all other optimizations are done, and we modify loops
+        // into shapes that might confuse other passes
+
         // while (1) { .. if (..) { break } } ==> do { .. } while(..)
         var stats = node[2][1];
         var last = stats[stats.length-1];
@@ -3208,6 +3356,11 @@ function asmLoopOptimizer(ast) {
           node[1] = simplifyNotCompsDirect(['unary-prefix', '!', conditionToBreak]);
           return node;
         }
+      } else if (type == 'binary' && node[1] == '&' && node[3][0] == 'unary-prefix' && node[3][1] == '-' && node[3][2][0] == 'num' && node[3][2][1] == 1) {
+        // Change &-1 into |0; at this point the hint is no longer needed
+        node[1] = '|';
+        node[3] = node[3][2];
+        node[3][1] = 0;
       }
     });
   });
@@ -3266,7 +3419,7 @@ arguments_.slice(1).forEach(function(arg) {
   passes[arg](ast);
 });
 if (asm && last) {
-  asmLoopOptimizer(ast); // TODO: move out of last, to make last faster when done later (as in side modules)
+  asmLastOpts(ast); // TODO: move out of last, to make last faster when done later (as in side modules)
   prepDotZero(ast);
 }
 var js = astToSrc(ast, minifyWhitespace), old;
diff --git a/tools/shared.py b/tools/shared.py
index 46245fd1..0351a736 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -257,7 +257,7 @@ def check_clang_version():
 
 def check_llvm_version():
   try:
-    check_clang_version();
+    check_clang_version()
   except Exception, e:
     logging.warning('Could not verify LLVM version: %s' % str(e))
 
@@ -1196,7 +1196,7 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)''' % { 'winfix': '' if not WINDOWS e
         opts.append('-jump-threading')
         opts.append('-correlated-propagation')
         opts.append('-dse')
-        #addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+        #addExtensionsToPM(EP_ScalarOptimizerLate, MPM)
 
         opts.append('-adce')
         opts.append('-simplifycfg')
@@ -1300,7 +1300,7 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)''' % { 'winfix': '' if not WINDOWS e
           '-O' + str(opt_level), '--closure', '0'], raw)
         f = open(relooper, 'w')
         f.write("// Relooper, (C) 2012 Alon Zakai, MIT license, https://github.com/kripken/Relooper\n")
-        f.write("var Relooper = (function() {\n");
+        f.write("var Relooper = (function() {\n")
         f.write(open(raw).read())
         f.write('\n  return Module.Relooper;\n')
         f.write('})();\n')
diff --git a/tools/test-js-optimizer-asm-last-output.js b/tools/test-js-optimizer-asm-last-output.js
index 0f95d544..f850b18f 100644
--- a/tools/test-js-optimizer-asm-last-output.js
+++ b/tools/test-js-optimizer-asm-last-output.js
@@ -30,6 +30,7 @@ function finall(x) {
  a = -999999984306749400.0;
  a = -999999984306749400.0;
  a = -0xde0b6b000000000;
+ f(g() | 0);
  return 12.0e10;
 }
 function looop() {
diff --git a/tools/test-js-optimizer-asm-last.js b/tools/test-js-optimizer-asm-last.js
index 05e1049e..1d39b1a6 100644
--- a/tools/test-js-optimizer-asm-last.js
+++ b/tools/test-js-optimizer-asm-last.js
@@ -30,6 +30,7 @@ function finall(x) {
  a = +-0xde0b6b000000000;
  a = -+0xde0b6b000000000;
  a = -0xde0b6b000000000;
+ f(g() & -1);
  return +12e10;
 }
 function looop() {