73 files changed, 3502 insertions, 4422 deletions
diff --git a/.gitignore b/.gitignore
index eaaa4ed5..31814a09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,11 @@
 *.pyc
 *~
 *.bc
+src/relooper*.js
 
 # Ignore generated files 
 src/relooper.js
 src/relooper.js.raw.js
+src/relooper/*.o
+src/relooper/*.out
+
diff --git a/AUTHORS b/AUTHORS
index 710ab203..04dfc10d 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -43,4 +43,5 @@ a license to everyone to use it as detailed in LICENSE.)
 * Xuejie Xiao <xxuejie@gmail.com>
 * Dominic Wong <dom@slowbunyip.org>
 * Alan Kligman <alan.kligman@gmail.com> (copyright owned by Mozilla Foundation)
+* Anthony Liot <wolfviking0@yahoo.com>
 
diff --git a/emcc b/emcc
index 87fb2672..4c4c3d97 100755
--- a/emcc
+++ b/emcc
@@ -90,12 +90,14 @@ LLVM_OPT_LEVEL = {
   3: 3,
 }
 
+MEMCPY_ALIASES = ['memcpy', 'llvm.memcpy.i32', 'llvm.memcpy.i64', 'llvm.memcpy.p0i8.p0i8.i32', 'llvm.memcpy.p0i8.p0i8.i64']
+
 DEBUG = int(os.environ.get('EMCC_DEBUG') or 0)
 TEMP_DIR = os.environ.get('EMCC_TEMP_DIR')
 LEAVE_INPUTS_RAW = os.environ.get('EMCC_LEAVE_INPUTS_RAW') # Do not compile .ll files into .bc, just compile them with emscripten directly
                                                            # Not recommended, this is mainly for the test runner, or if you have some other
                                                            # specific need.
-                                                           # One major limitation with this mode is that dlmalloc and libc++ cannot be
+                                                           # One major limitation with this mode is that libc and libc++ cannot be
                                                            # added in. Also, LLVM optimizations will not be done, nor dead code elimination
 AUTODEBUG = os.environ.get('EMCC_AUTODEBUG') # If set to 1, we will run the autodebugger (the automatic debugging tool, see tools/autodebugger).
                                              # Note that this will disable inclusion of libraries. This is useful because including
@@ -338,7 +340,7 @@ Options that are modified or new in %s include:
 
   --clear-cache            Manually clears the cache of compiled
                            emscripten system libraries (libc++,
-                           libc++abi, dlmalloc). This is normally
+                           libc++abi, libc). This is normally
                            handled automatically, but if you update
                            llvm in-place (instead of having a different
                            directory for a new version), the caching
@@ -353,9 +355,9 @@ Options that are modified or new in %s include:
 The target file, if specified (-o <target>), defines what will
 be generated:
 
-  <name>.js                JavaScript (default)
+  <name>.js                JavaScript
   <name>.html              HTML with embedded JavaScript
-  <name>.bc                LLVM bitcode
+  <name>.bc                LLVM bitcode (default)
   <name>.o                 LLVM bitcode (same as .bc)
 
 The -c option (which tells gcc not to run the linker) will
@@ -718,8 +720,6 @@ try:
   if llvm_opts is None: llvm_opts = LLVM_OPT_LEVEL[opt_level]
   if llvm_lto is None: llvm_lto = llvm_opts > 0
   if closure is None: closure = 1 if opt_level >= 2 else 0
-  if minify_whitespace is None:
-    minify_whitespace = closure # if closure is run, minify whitespace
   if opt_level <= 0: keep_debug = True # always keep debug in -O0
 
   if DEBUG: start_time = time.time() # done after parsing arguments, which might affect debug state
@@ -848,9 +848,23 @@ try:
     exec('shared.Settings.' + key + ' = ' + value)
 
   # Apply effects from settings
+  if shared.Settings.ASM_JS:
+    if closure:
+      print >> sys.stderr, 'emcc: warning: disabling closure because it is not compatible with asm.js code generation'
+      closure = False
+    if shared.Settings.CORRECT_SIGNS != 1: 
+      print >> sys.stderr, 'emcc: warning: setting CORRECT_SIGNS to 1 for asm.js code generation'
+      shared.Settings.CORRECT_SIGNS = 1
+    if shared.Settings.CORRECT_OVERFLOWS != 1: 
+      print >> sys.stderr, 'emcc: warning: setting CORRECT_OVERFLOWS to 1 for asm.js code generation'
+      shared.Settings.CORRECT_OVERFLOWS = 1
+
   if shared.Settings.CORRECT_SIGNS >= 2 or shared.Settings.CORRECT_OVERFLOWS >= 2 or shared.Settings.CORRECT_ROUNDINGS >= 2:
     keep_debug = True # must keep debug info to do line-by-line operations 
 
+  if minify_whitespace is None:
+    minify_whitespace = closure # if closure is run, minify whitespace
+
   ## Compile source code to bitcode
 
   if DEBUG: print >> sys.stderr, 'emcc: compiling to bitcode'
@@ -922,16 +936,26 @@ try:
     # Note that we assume a single symbol is enough to know if we have/do not have dlmalloc etc. If you
     # include just a few symbols but want the rest, this will not work.
 
-    # dlmalloc
-    def create_dlmalloc():
-      if DEBUG: print >> sys.stderr, 'emcc: building dlmalloc for cache'
-      execute([shared.PYTHON, shared.EMCC, shared.path_from_root('system', 'lib', 'dlmalloc.c'), '-g', '-o', in_temp('dlmalloc.o')], stdout=stdout, stderr=stderr)
-      # we include the libc++ new stuff here, so that the common case of using just new/delete is quick to link
-      execute([shared.PYTHON, shared.EMXX, shared.path_from_root('system', 'lib', 'libcxx', 'new.cpp'), '-g', '-o', in_temp('new.o')], stdout=stdout, stderr=stderr)
-      shared.Building.link([in_temp('dlmalloc.o'), in_temp('new.o')], in_temp('dlmalloc_full.o'))
-      return in_temp('dlmalloc_full.o')
-    def fix_dlmalloc():
-      # dlmalloc needs some sign correction. # If we are in mode 0, switch to 2. We will add our lines
+    # libc
+    def create_libc():
+      if DEBUG: print >> sys.stderr, 'emcc: building libc for cache'
+      o_s = []
+      for src in ['dlmalloc.c', os.path.join('libc', 'musl', 'memcpy.c'), os.path.join('libcxx', 'new.cpp')]:
+        o = in_temp(os.path.basename(src) + '.o')
+        execute([shared.PYTHON, shared.EMCC, shared.path_from_root('system', 'lib', src), '-o', o], stdout=stdout, stderr=stderr)
+        o_s.append(o)
+      shared.Building.link(o_s, in_temp('libc.bc'))
+      return in_temp('libc.bc')
+
+    def fix_libc(need):
+      # If an intrinsic alias of memcpy is used, we need memcpy
+      for memcpy_alias in MEMCPY_ALIASES:
+        if memcpy_alias in need:
+          if '_memcpy' not in shared.Settings.EXPORTED_FUNCTIONS:
+            shared.Settings.EXPORTED_FUNCTIONS.append('_memcpy')
+            break
+
+      # libc needs some sign correction. If we are in mode 0, switch to 2. We will add our lines
       try:
         if shared.Settings.CORRECT_SIGNS == 0: raise Exception('we need to change to 2')
       except: # we fail if equal to 0 - so we need to switch to 2 - or if CORRECT_SIGNS is not even in Settings
@@ -942,7 +966,7 @@ try:
       # so all is well anyhow too.
     # XXX We also need to add libc symbols that use malloc, for example strdup. It's very rare to use just them and not
     #     a normal malloc symbol (like free, after calling strdup), so we haven't hit this yet, but it is possible.
-    dlmalloc_symbols = open(shared.path_from_root('system', 'lib', 'dlmalloc.symbols')).read().split('\n')
+    libc_symbols = open(shared.path_from_root('system', 'lib', 'libc.symbols')).read().split('\n')
 
     # libcxx
     def create_libcxx():
@@ -954,13 +978,13 @@ try:
         os.append(o)
       shared.Building.link(os, in_temp('libcxx.bc'))
       return in_temp('libcxx.bc')
-    def fix_libcxx():
+    def fix_libcxx(need):
       assert shared.Settings.QUANTUM_SIZE == 4, 'We do not support libc++ with QUANTUM_SIZE == 1'
       # libcxx might need corrections, so turn them all on. TODO: check which are actually needed
       shared.Settings.CORRECT_SIGNS = shared.Settings.CORRECT_OVERFLOWS = shared.Settings.CORRECT_ROUNDINGS = 1
       #print >> sys.stderr, 'emcc: info: using libcxx turns on CORRECT_* options'
     libcxx_symbols = map(lambda line: line.strip().split(' ')[1], open(shared.path_from_root('system', 'lib', 'libcxx', 'symbols')).readlines())
-    libcxx_symbols = filter(lambda symbol: symbol not in dlmalloc_symbols, libcxx_symbols)
+    libcxx_symbols = filter(lambda symbol: symbol not in libc_symbols, libcxx_symbols)
     libcxx_symbols = set(libcxx_symbols)
 
     # libcxxabi - just for dynamic_cast for now
@@ -973,19 +997,19 @@ try:
         os.append(o)
       shared.Building.link(os, in_temp('libcxxabi.bc'))
       return in_temp('libcxxabi.bc')
-    def fix_libcxxabi():
+    def fix_libcxxabi(need):
       assert shared.Settings.QUANTUM_SIZE == 4, 'We do not support libc++abi with QUANTUM_SIZE == 1'
       #print >> sys.stderr, 'emcc: info: using libcxxabi, this may need CORRECT_* options'
       #shared.Settings.CORRECT_SIGNS = shared.Settings.CORRECT_OVERFLOWS = shared.Settings.CORRECT_ROUNDINGS = 1
     libcxxabi_symbols = map(lambda line: line.strip().split(' ')[1], open(shared.path_from_root('system', 'lib', 'libcxxabi', 'symbols')).readlines())
-    libcxxabi_symbols = filter(lambda symbol: symbol not in dlmalloc_symbols, libcxxabi_symbols)
+    libcxxabi_symbols = filter(lambda symbol: symbol not in libc_symbols, libcxxabi_symbols)
     libcxxabi_symbols = set(libcxxabi_symbols)
 
-    force = False # If we have libcxx, we must force inclusion of dlmalloc, since libcxx uses new internally. Note: this is kind of hacky
+    force = False # If we have libcxx, we must force inclusion of libc, since libcxx uses new internally. Note: this is kind of hacky
 
     for name, create, fix, library_symbols in [('libcxx',    create_libcxx,    fix_libcxx,    libcxx_symbols),
                                                ('libcxxabi', create_libcxxabi, fix_libcxxabi, libcxxabi_symbols),
-                                               ('dlmalloc',  create_dlmalloc,  fix_dlmalloc,  dlmalloc_symbols)]:
+                                               ('libc',      create_libc,      fix_libc,      libc_symbols)]:
       need = set()
       has = set()
       for temp_file in temp_files:
@@ -1014,7 +1038,7 @@ try:
         extra_files_to_link.append(libfile)
         force = True
         if fix:
-          fix()
+          fix(need)
 
   # First, combine the bitcode files if there are several. We must also link if we have a singleton .a
   if len(input_files) + len(extra_files_to_link) > 1 or \
@@ -1065,6 +1089,36 @@ try:
       shared.Building.llvm_opt(in_temp(target_basename + '.bc'), link_opts)
       if DEBUG: save_intermediate('linktime', 'bc')
 
+    # Optimization and lto can add new intrinsics like memcpy that were not present before. We
+    # are now *after* linking in libc, so we missed our chance to get memcpy - check and add it now
+    # if necessary
+    final_symbols = shared.Building.llvm_nm(final)
+    need_memcpy = False
+    for symbol in final_symbols.undefs:
+      if symbol in MEMCPY_ALIASES:
+        need_memcpy = True
+        break
+    has_memcpy = False
+    for symbol in final_symbols.defs:
+      if symbol in MEMCPY_ALIASES:
+        has_memcpy = True
+        break
+    if need_memcpy and not has_memcpy:
+      if DEBUG: print >> sys.stderr, 'memcpy intrinsic added in optimizations, linking in optimized memcpy'
+      memcpy = in_temp('memcpy.bc')
+      force_cxx = os.environ.get('EMMAKEN_CXX')
+      if force_cxx is not None: del os.environ['EMMAKEN_CXX'] # memcpy must be compiled as C
+      execute([shared.PYTHON, shared.EMCC, shared.path_from_root('system', 'lib', 'libc', 'musl', 'memcpy.c'), '-o', memcpy], stdout=stdout, stderr=stderr)
+      if force_cxx is not None: os.environ['EMMAKEN_CXX'] = force_cxx
+      shared.Building.llvm_opt(memcpy, llvm_opts) # optimize it just like normal code; no point in lto though
+      next = final + '.postrinsics.bc'
+      shared.Building.link([final, memcpy], next)
+      final = next
+      if shared.Settings.ASM_JS: # export it so other library functions etc. can use it
+        if '_memcpy' not in shared.Settings.EXPORTED_FUNCTIONS:
+          shared.Settings.EXPORTED_FUNCTIONS.append('_memcpy')
+      if DEBUG: save_intermediate('postrinsics', 'bc')
+
   # Prepare .ll for Emscripten
   if not LEAVE_INPUTS_RAW:
     final = shared.Building.llvm_dis(final, final + '.ll')
@@ -1131,6 +1185,17 @@ try:
     execute(shlex.split(js_transform, posix=posix) + [os.path.abspath(final)])
     if DEBUG: save_intermediate('transformed')
 
+  if shared.Settings.ASM_JS: # XXX temporary wrapping for testing purposes
+    print >> sys.stderr, 'emcc: ASM_JS mode is highly experimental, and will not work on most codebases yet. It is NOT recommended that you try this yet.' # XXX TODO: 0.0 instead of +0 for local var defs
+    unwrapped = open(final).read()
+    final += '.asmwrap.js'
+    open(final, 'w').write('''
+(function() { // prevent new Function from seeing the global scope
+%s
+}).apply(null, arguments);
+''' % unwrapped)
+    if DEBUG: save_intermediate('asmwrap')
+
   # It is useful to run several js optimizer passes together, to save on unneeded unparsing/reparsing
   js_optimizer_queue = []
   def flush_js_optimizer_queue():
@@ -1156,11 +1221,21 @@ try:
       if DEBUG: save_intermediate('pretty')
 
     def get_eliminate():
-      return 'eliminate' if not shared.Settings.ALLOW_MEMORY_GROWTH else 'eliminateMemSafe'
+      if shared.Settings.ASM_JS:
+        return 'eliminateAsm'
+      elif shared.Settings.ALLOW_MEMORY_GROWTH:
+        return 'eliminateMemSafe'
+      else:
+        return 'eliminate'
+
+    def get_simplify_pre():
+      if shared.Settings.ASM_JS:
+        return 'simplifyExpressionsPreAsm'
+      else:
+        return 'simplifyExpressionsPre'
 
-    js_optimizer_queue += [get_eliminate()]
+    js_optimizer_queue += [get_eliminate(), get_simplify_pre()]
 
-    js_optimizer_queue += ['simplifyExpressionsPre']
     if shared.Settings.RELOOP:
       js_optimizer_queue += ['optimizeShiftsAggressive', get_eliminate()] # aggressive shifts optimization requires loops, it breaks on switches
 
@@ -1170,6 +1245,8 @@ try:
     if DEBUG: print >> sys.stderr, 'emcc: running closure'
     final = shared.Building.closure_compiler(final)
     if DEBUG: save_intermediate('closure')
+  elif shared.Settings.ASM_JS and shared.Settings.RELOOP:
+    js_optimizer_queue += ['registerizeAsm'] # we can't use closure in asm, but this does much of the same
 
   if opt_level >= 1:
     if DEBUG: print >> sys.stderr, 'emcc: running post-closure post-opts'
diff --git a/emscripten.py b/emscripten.py
index 3c636447..ac13f7a3 100755
--- a/emscripten.py
+++ b/emscripten.py
@@ -129,10 +129,13 @@ def emscript(infile, settings, outfile, libraries=[]):
 
   # Save settings to a file to work around v8 issue 1579
   settings_file = temp_files.get('.txt').name
-  settings_text = json.dumps(settings)
-  s = open(settings_file, 'w')
-  s.write(settings_text)
-  s.close()
+  def save_settings():
+    global settings_text
+    settings_text = json.dumps(settings)
+    s = open(settings_file, 'w')
+    s.write(settings_text)
+    s.close()
+  save_settings()
 
   # Phase 1 - pre
   if DEBUG: t = time.time()
@@ -170,6 +173,9 @@ def emscript(infile, settings, outfile, libraries=[]):
   if DEBUG: t = time.time()
   forwarded_json = json.loads(forwarded_data)
   indexed_functions = set()
+  if settings.get('ASM_JS'):
+    settings['EXPORTED_FUNCTIONS'] = forwarded_json['EXPORTED_FUNCTIONS']
+    save_settings()
 
   chunks = shared.JCache.chunkify(funcs, chunk_size, 'emscript_files' if jcache else None)
 
@@ -223,16 +229,27 @@ def emscript(infile, settings, outfile, libraries=[]):
   if DEBUG: print >> sys.stderr, '  emscript: phase 2 took %s seconds' % (time.time() - t)
   if DEBUG: t = time.time()
 
-  funcs_js = ''.join([output[0] for output in outputs])
-
+  # merge forwarded data
+  if settings.get('ASM_JS'):
+    all_exported_functions = set(settings['EXPORTED_FUNCTIONS']) # both asm.js and otherwise
+    for additional_export in ['_malloc', '_free']: # additional functions to export from asm, if they are implemented
+      all_exported_functions.add(additional_export)
+    exported_implemented_functions = set()
   for func_js, curr_forwarded_data in outputs:
-    # merge forwarded data
     curr_forwarded_json = json.loads(curr_forwarded_data)
     forwarded_json['Types']['preciseI64MathUsed'] = forwarded_json['Types']['preciseI64MathUsed'] or curr_forwarded_json['Types']['preciseI64MathUsed']
     for key, value in curr_forwarded_json['Functions']['blockAddresses'].iteritems():
       forwarded_json['Functions']['blockAddresses'][key] = value
     for key in curr_forwarded_json['Functions']['indexedFunctions'].iterkeys():
       indexed_functions.add(key)
+    if settings.get('ASM_JS'):
+      for key in curr_forwarded_json['Functions']['implementedFunctions'].iterkeys():
+        if key in all_exported_functions: exported_implemented_functions.add(key)
+    for key, value in curr_forwarded_json['Functions']['unimplementedFunctions'].iteritems():
+      forwarded_json['Functions']['unimplementedFunctions'][key] = value
+
+  funcs_js = ''.join([output[0] for output in outputs])
+
   outputs = None
   if DEBUG: print >> sys.stderr, '  emscript: phase 2b took %s seconds' % (time.time() - t)
   if DEBUG: t = time.time()
@@ -241,6 +258,7 @@ def emscript(infile, settings, outfile, libraries=[]):
   forwarded_json['Functions']['indexedFunctions'] = {}
   i = 2
   for indexed in indexed_functions:
+    #print >> sys.stderr, 'indaxx', indexed, i
     forwarded_json['Functions']['indexedFunctions'][indexed] = i # make sure not to modify this python object later - we use it in indexize
     i += 2
   forwarded_json['Functions']['nextIndex'] = i
@@ -258,8 +276,6 @@ def emscript(infile, settings, outfile, libraries=[]):
   pre = None
 
   #if DEBUG: outfile.write('// funcs\n')
-  outfile.write(blockaddrsize(indexize(funcs_js)))
-  funcs_js = None
 
   # forward
   forwarded_data = json.dumps(forwarded_json)
@@ -272,8 +288,163 @@ def emscript(infile, settings, outfile, libraries=[]):
   post_file = temp_files.get('.post.ll').name
   open(post_file, 'w').write('\n') # no input, just processing of forwarded data
   out = shared.run_js(compiler, shared.COMPILER_ENGINE, [settings_file, post_file, 'post', forwarded_file] + libraries, stdout=subprocess.PIPE, cwd=path_from_root('src'))
-  #if DEBUG: outfile.write('// post\n')
-  outfile.write(indexize(out))
+  post, last_forwarded_data = out.split('//FORWARDED_DATA:')
+  last_forwarded_json = json.loads(last_forwarded_data)
+
+  if settings.get('ASM_JS'):
+    simple = os.environ.get('EMCC_SIMPLE_ASM')
+    class Counter:
+      i = 0
+    def make_table(sig, raw):
+      i = Counter.i
+      Counter.i += 1
+      bad = 'b' + str(i)
+      params = ','.join(['p%d' % p for p in range(len(sig)-1)])
+      coercions = ';'.join(['p%d = %sp%d%s' % (p, '+' if sig[p+1] == 'd' else '', p, '' if sig[p+1] == 'd' else '|0') for p in range(len(sig)-1)]) + ';'
+      ret = '' if sig[0] == 'v' else ('return %s0' % ('+' if sig[0] == 'd' else ''))
+      return 'function %s(%s) { %s abort(%d); %s };\n' % (bad, params, coercions, i, ret) + raw.replace('[0,', '[' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0,', ',' + bad + ',').replace(',0]', ',' + bad + ']').replace(',0]', ',' + bad + ']')
+    function_tables_defs = '\n'.join([make_table(sig, raw) for sig, raw in last_forwarded_json['Functions']['tables'].iteritems()])
+
+    maths = ['Runtime.bitshift64', 'Math.floor', 'Math.min', 'Math.abs', 'Math.sqrt', 'Math.pow', 'Math.cos', 'Math.sin', 'Math.tan', 'Math.acos', 'Math.asin', 'Math.atan', 'Math.atan2', 'Math.exp', 'Math.log', 'Math.ceil']
+
+    if settings['USE_MATH_IMUL']:
+      maths += ['Math.imul']
+    asm_setup = '\n'.join(['var %s = %s;' % (f.replace('.', '_'), f) for f in