Use typed arrays instead of normal JS arrays to back files in the MEMFS filesystem for improved performance, especially when backing to IDBFS.

author: Jukka Jylänki <jujjyl@gmail.com> 2014-06-05 02:00:08 +0300
committer: Jukka Jylänki <jujjyl@gmail.com> 2014-06-05 05:54:04 +0300
commit: 92cab32f5ffcc00779b521588bec62f597c98aa7 (patch)
tree: a7c248ecf7a34dac50f326e62f8c398ec07ab2c6
parent: 3431eab292e0b9e2359b183fa2c954eb7e5cf7e7 (diff)
4 files changed, 136 insertions, 56 deletions
diff --git a/src/library_fs.js b/src/library_fs.js
index 5f7f1dea..1fff6348 100644
--- a/src/library_fs.js
+++ b/src/library_fs.js
@@ -1480,6 +1480,7 @@ mergeInto(LibraryManager.library, {
           // WARNING: Can't read binary files in V8's d8 or tracemonkey's js, as
           //          read() will try to parse UTF8.
           obj.contents = intArrayFromString(Module['read'](obj.url), true);
+          obj.usedBytes = obj.contents.length;
         } catch (e) {
           success = false;
         }
@@ -1601,6 +1602,10 @@ mergeInto(LibraryManager.library, {
         node.contents = null;
         node.url = properties.url;
       }
+      // Add a function that defers querying the file size until it is asked the first time.
+      Object.defineProperty(node, "usedBytes", {
+          get: function() { return this.contents.length; }
+      });
       // override each stream op with one that tries to force load the lazy file first
       var stream_ops = {};
       var keys = Object.keys(node.stream_ops);
diff --git a/src/library_memfs.js b/src/library_memfs.js
index 95c3ae65..4b56ebbb 100644
--- a/src/library_memfs.js
+++ b/src/library_memfs.js
@@ -2,11 +2,6 @@ mergeInto(LibraryManager.library, {
   $MEMFS__deps: ['$FS'],
   $MEMFS: {
     ops_table: null,
-
-    // content modes
-    CONTENT_OWNING: 1, // contains a subarray into the heap, and we own it, without copying (note: someone else needs to free() it, if that is necessary)
-    CONTENT_FLEXIBLE: 2, // has been modified or never set to anything, and is a flexible js array that can grow/shrink
-    CONTENT_FIXED: 3, // contains some fixed-size content written into it, in a typed array
     mount: function(mount) {
       return MEMFS.createNode(null, '/', {{{ cDefine('S_IFDIR') }}} | 511 /* 0777 */, 0);
     },
@@ -71,8 +66,11 @@ mergeInto(LibraryManager.library, {
       } else if (FS.isFile(node.mode)) {
         node.node_ops = MEMFS.ops_table.file.node;
         node.stream_ops = MEMFS.ops_table.file.stream;
-        node.contents = [];
-        node.contentMode = MEMFS.CONTENT_FLEXIBLE;
+        node.usedBytes = 0; // The actual number of bytes used in the typed array, as opposed to contents.buffer.byteLength which gives the whole capacity.
+        // When the byte data of the file is populated, this will point to either a typed array, or a normal JS array. Typed arrays are preferred
+        // for performance, and used by default. However, typed arrays are not resizable like normal JS arrays are, so appending writes that
+        // continuously grow a file carry a small memory overhead: the storage is over-allocated, similar to the std::vector capacity vs. size scheme.
+        node.contents = null; 
       } else if (FS.isLink(node.mode)) {
         node.node_ops = MEMFS.ops_table.link.node;
         node.stream_ops = MEMFS.ops_table.link.stream;
@@ -87,13 +85,80 @@ mergeInto(LibraryManager.library, {
       }
       return node;
     },
-    ensureFlexible: function(node) {
-      if (node.contentMode !== MEMFS.CONTENT_FLEXIBLE) {
-        var contents = node.contents;
-        node.contents = Array.prototype.slice.call(contents);
-        node.contentMode = MEMFS.CONTENT_FLEXIBLE;
+
+    // Given a file node, returns its file data as a regular JS array (an empty array for a file with no contents). Treat the result as read-only: it may be a copy or the live backing store.
+    getFileDataAsRegularArray: function(node) {
+#if USE_TYPED_ARRAYS == 2
+      if (node.contents && node.contents.subarray) {
+        var arr = [];
+        for (var i = 0; i < node.usedBytes; ++i) arr.push(node.contents[i]); // Copy only the used bytes, not the spare capacity.
+        return arr; // Returns a copy of the original data.
+      }
+#endif
+      return node.contents || []; // Already a JS array: return as-is. An empty file has null contents, so hand back an empty array rather than null.
+    },
+
+    // Grows the backing store of the given file node so that it can fit at least newCapacity bytes. Callers update node.usedBytes themselves.
+    // May allocate more, to provide automatic geometric increase and amortized linear performance for appending writes.
+    // Never shrinks the storage.
+    expandFileStorage: function(node, newCapacity) {
+#if USE_TYPED_ARRAYS == 2
+
+#if !MEMFS_APPEND_TO_TYPED_ARRAYS
+      // If we are asked to expand the size of a file that already exists, revert to using a standard JS array to store the file
+      // instead of a typed array. This makes resizing the array more flexible because we can just .push() elements at the back to
+      // increase the size. Capacity is the length of the view, not of its buffer, which may be larger (e.g. a subarray adopted by write()).
+      if (node.contents && node.contents.subarray && newCapacity > node.contents.length) {
+        node.contents = MEMFS.getFileDataAsRegularArray(node);
+        node.usedBytes = node.contents.length;
+      }
+#endif
+
+      if (!node.contents || node.contents.subarray) { // Keep using a typed array if creating a new storage, or if old one was a typed array as well.
+        var prevCapacity = node.contents ? node.contents.length : 0; // Writable bytes of the view; buffer.byteLength would overstate it for a subarray of a bigger buffer.
+        if (prevCapacity >= newCapacity) return; // No need to expand, the storage was already large enough.
+        // Don't expand strictly to the given requested limit if it's only a very small increase, but instead geometrically grow capacity.
+        // For small filesizes (<1MB), perform size*2 geometric increase, but for large sizes, do a much more conservative size*1.125 increase to
+        // avoid overshooting the allocation cap by a very large margin.
+        var CAPACITY_DOUBLING_MAX = 1024 * 1024;
+        newCapacity = Math.max(newCapacity, (prevCapacity * (prevCapacity < CAPACITY_DOUBLING_MAX ? 2.0 : 1.125)) | 0);
+        if (prevCapacity != 0) newCapacity = Math.max(newCapacity, 256); // At minimum allocate 256b for each file when expanding.
+        var oldContents = node.contents;
+        node.contents = new Uint8Array(new ArrayBuffer(newCapacity)); // Allocate new storage.
+        if (node.usedBytes > 0) node.contents.set(oldContents.subarray(0, node.usedBytes), 0); // Copy old data over to the new storage.
+        return;
       }
+#endif
+      // Not using a typed array to back the file storage. Use a standard JS array instead.
+      if (!node.contents && newCapacity > 0) node.contents = [];
+      while (node.contents && node.contents.length < newCapacity) node.contents.push(0); // contents stays null when nothing was requested for an empty file.
     },
+
+    // Performs an exact resize of the backing file storage to newSize bytes and sets node.usedBytes to match. A typed array store is fully reallocated unless the size is already exactly newSize.
+    resizeFileStorage: function(node, newSize) {
+      if (node.usedBytes == newSize) return;
+      if (newSize == 0) {
+        node.contents = null; // Fully decommit when requesting a resize to zero.
+        node.usedBytes = 0;
+        return;
+      }
+
+#if USE_TYPED_ARRAYS == 2
+      if (!node.contents || node.contents.subarray) { // Resize a typed array if that is being used as the backing store.
+        var oldContents = node.contents;
+        node.contents = new Uint8Array(new ArrayBuffer(newSize)); // Allocate new storage; it starts out zero-filled.
+        if (oldContents) node.contents.set(oldContents.subarray(0, Math.min(newSize, node.usedBytes))); // Copy old data over to the new storage. An empty file has no old store to copy from.
+        node.usedBytes = newSize;
+        return;
+      }
+#endif
+      // Backing with a JS array.
+      if (!node.contents) node.contents = [];
+      if (node.contents.length > newSize) node.contents.length = newSize;
+      else while (node.contents.length < newSize) node.contents.push(0);
+      node.usedBytes = newSize;
+    },
+
     node_ops: {
       getattr: function(node) {
         var attr = {};
@@ -108,7 +173,7 @@ mergeInto(LibraryManager.library, {
         if (FS.isDir(node.mode)) {
           attr.size = 4096;
         } else if (FS.isFile(node.mode)) {
-          attr.size = node.contents.length;
+          attr.size = node.usedBytes;
         } else if (FS.isLink(node.mode)) {
           attr.size = node.link.length;
         } else {
@@ -131,10 +196,7 @@ mergeInto(LibraryManager.library, {
           node.timestamp = attr.timestamp;
         }
         if (attr.size !== undefined) {
-          MEMFS.ensureFlexible(node);
-          var contents = node.contents;
-          if (attr.size < contents.length) contents.length = attr.size;
-          else while (attr.size > contents.length) contents.push(0);
+          MEMFS.resizeFileStorage(node, attr.size);
         }
       },
       lookup: function(parent, name) {
@@ -198,9 +260,8 @@ mergeInto(LibraryManager.library, {
     stream_ops: {
       read: function(stream, buffer, offset, length, position) {
         var contents = stream.node.contents;
-        if (position >= contents.length)
-          return 0;
-        var size = Math.min(contents.length - position, length);
+        if (position >= stream.node.usedBytes) return 0;
+        var size = Math.min(stream.node.usedBytes - position, length);
         assert(size >= 0);
 #if USE_TYPED_ARRAYS == 2
         if (size > 8 && contents.subarray) { // non-trivial, and typed array
@@ -208,47 +269,53 @@ mergeInto(LibraryManager.library, {
         } else
 #endif
         {
-          for (var i = 0; i < size; i++) {
-            buffer[offset + i] = contents[position + i];
-          }
+          for (var i = 0; i < size; i++) buffer[offset + i] = contents[position + i];
         }
         return size;
       },
+
+      // Writes the 'length' bytes buffer[offset .. offset+length-1] to offset 'position' of the file pointed to by 'stream'. Returns the number of bytes written.
       write: function(stream, buffer, offset, length, position, canOwn) {
+        if (!length) return 0;
         var node = stream.node;
         node.timestamp = Date.now();
-        var contents = node.contents;
+
 #if USE_TYPED_ARRAYS == 2
-        if (length && contents.length === 0 && position === 0 && buffer.subarray) {
-          // just replace it with the new data
-#if ASSERTIONS
-          assert(buffer.length);
-#endif
-          if (canOwn && offset === 0) {
-            node.contents = buffer; // this could be a subarray of Emscripten HEAP, or allocated from some other source.
-            node.contentMode = (buffer.buffer === HEAP8.buffer) ? MEMFS.CONTENT_OWNING : MEMFS.CONTENT_FIXED;
-          } else {
-            node.contents = new Uint8Array(buffer.subarray(offset, offset+length));
-            node.contentMode = MEMFS.CONTENT_FIXED;
+        if (buffer.subarray && (!node.contents || node.contents.subarray)) { // This write is from a typed array to a typed array?
+          if (canOwn && position === 0 && node.usedBytes === 0) { // Reuse the given buffer without copying, but only when it becomes the entire file; otherwise existing data or the write position would be lost.
+            node.contents = buffer.subarray(offset, offset + length);
+            node.usedBytes = length;
+            return length;
+          } else if (node.usedBytes === 0 && position === 0) { // If this is the first write to an empty file at its start, do a fast set since we don't need to care about old data.
+            node.contents = new Uint8Array(buffer.subarray(offset, offset + length));
+            node.usedBytes = length;
+            return length;
+          } else if (position + length <= node.usedBytes) { // Writing to an already allocated and used subrange of the file?
+            node.contents.set(buffer.subarray(offset, offset + length), position);
+            return length;
           }
-          return length;
         }
 #endif
-        MEMFS.ensureFlexible(node);
-        var contents = node.contents;
-        while (contents.length < position) contents.push(0);
-        for (var i = 0; i < length; i++) {
-          contents[position + i] = buffer[offset + i];
-        }
+        // Appending to an existing file and we need to reallocate, writing past the start of an empty file, or source data did not come as a typed array.
+        MEMFS.expandFileStorage(node, position+length);
+#if USE_TYPED_ARRAYS == 2
+        if (node.contents.subarray && buffer.subarray) node.contents.set(buffer.subarray(offset, offset + length), position); // Use typed array write if available.
+        else
+#endif
+          for (var i = 0; i < length; i++) {
+           node.contents[position + i] = buffer[offset + i]; // Or fall back to manual write if not.
+          }
+        node.usedBytes = Math.max(node.usedBytes, position+length);
         return length;
       },
+
       llseek: function(stream, offset, whence) {
         var position = offset;
         if (whence === 1) {  // SEEK_CUR.
           position += stream.position;
         } else if (whence === 2) {  // SEEK_END.
           if (FS.isFile(stream.node.mode)) {
-            position += stream.node.contents.length;
+            position += stream.node.usedBytes;
           }
         }
         if (position < 0) {
@@ -259,10 +326,8 @@ mergeInto(LibraryManager.library, {
         return position;
       },
       allocate: function(stream, offset, length) {
-        MEMFS.ensureFlexible(stream.node);
-        var contents = stream.node.contents;
-        var limit = offset + length;
-        while (limit > contents.length) contents.push(0);
+        MEMFS.expandFileStorage(stream.node, offset + length);
+        stream.node.usedBytes = Math.max(stream.node.usedBytes, offset + length);
       },
       mmap: function(stream, buffer, offset, length, position, prot, flags) {
         if (!FS.isFile(stream.node.mode)) {
@@ -280,7 +345,7 @@ mergeInto(LibraryManager.library, {
           ptr = contents.byteOffset;
         } else {
           // Try to avoid unnecessary slices.
-          if (position > 0 || position + length < contents.length) {
+          if (position > 0 || position + length < stream.node.usedBytes) {
             if (contents.subarray) {
               contents = contents.subarray(position, position + length);
             } else {
diff --git a/src/settings.js b/src/settings.js
index bdb149e3..7d9d1b57 100644
--- a/src/settings.js
+++ b/src/settings.js
@@ -323,6 +323,10 @@ var FS_LOG = 0; // Log all FS operations.  This is especially helpful when you'r
                 // so that you can create a virtual file system with all of the required files.
 var CASE_INSENSITIVE_FS = 0; // If set to nonzero, the provided virtual filesystem if treated case-insensitive, like
                              // Windows and OSX do. If set to 0, the VFS is case-sensitive, like on Linux.
+var MEMFS_APPEND_TO_TYPED_ARRAYS = 0; // If set to nonzero, MEMFS will always utilize typed arrays as the backing store 
+                                      // for writing to files. The default behavior is to use typed arrays only while
+                                      // a file's size does not change; once a write needs to grow an existing file
+                                      // (e.g. appending writes), the file is switched to a normal JS array instead.
 
 var USE_BSS = 1; // https://en.wikipedia.org/wiki/.bss
                  // When enabled, 0-initialized globals are sorted to the end of the globals list,
diff --git a/tests/test_core.py b/tests/test_core.py
index bcb03830..f34ba03e 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -4030,11 +4030,14 @@ def process(filename):
     src = open(path_from_root('tests', 'files.cpp'), 'r').read()
 
     mem_file = 'src.cpp.o.js.mem'
-    try_delete(mem_file)
-    self.do_run(src, ('size: 7\ndata: 100,-56,50,25,10,77,123\nloop: 100 -56 50 25 10 77 123 \ninput:hi there!\ntexto\n$\n5 : 10,30,20,11,88\nother=some data.\nseeked=me da.\nseeked=ata.\nseeked=ta.\nfscanfed: 10 - hello\nok.\ntexte\n', 'size: 7\ndata: 100,-56,50,25,10,77,123\nloop: 100 -56 50 25 10 77 123 \ninput:hi there!\ntexto\ntexte\n$\n5 : 10,30,20,11,88\nother=some data.\nseeked=me da.\nseeked=ata.\nseeked=ta.\nfscanfed: 10 - hello\nok.\n'),
-                 post_build=post, extra_emscripten_args=['-H', 'libc/fcntl.h'])
-    if self.emcc_args and '--memory-init-file' in self.emcc_args:
-      assert os.path.exists(mem_file)
+    orig_args = self.emcc_args
+    for modes in [[], ['-s', 'MEMFS_APPEND_TO_TYPED_ARRAYS=1']]:
+      self.emcc_args = orig_args + modes
+      try_delete(mem_file)
+      self.do_run(src, ('size: 7\ndata: 100,-56,50,25,10,77,123\nloop: 100 -56 50 25 10 77 123 \ninput:hi there!\ntexto\n$\n5 : 10,30,20,11,88\nother=some data.\nseeked=me da.\nseeked=ata.\nseeked=ta.\nfscanfed: 10 - hello\nok.\ntexte\n', 'size: 7\ndata: 100,-56,50,25,10,77,123\nloop: 100 -56 50 25 10 77 123 \ninput:hi there!\ntexto\ntexte\n$\n5 : 10,30,20,11,88\nother=some data.\nseeked=me da.\nseeked=ata.\nseeked=ta.\nfscanfed: 10 - hello\nok.\n'),
+                  post_build=post, extra_emscripten_args=['-H', 'libc/fcntl.h'])
+      if self.emcc_args and '--memory-init-file' in self.emcc_args:
+        assert os.path.exists(mem_file)
 
   def test_files_m(self):
     # Test for Module.stdin etc.
@@ -4275,7 +4278,10 @@ def process(filename):
     if self.emcc_args is None: return self.skip('requires libcxx')
     test_path = path_from_root('tests', 'core', 'test_wprintf')
     src, output = (test_path + s for s in ('.c', '.out'))
-    self.do_run_from_file(src, output)
+    orig_args = self.emcc_args
+    for modes in [[], ['-s', 'MEMFS_APPEND_TO_TYPED_ARRAYS=1']]:
+      self.emcc_args = orig_args + modes
+      self.do_run_from_file(src, output)
 
   def test_direct_string_constant_usage(self):
     if self.emcc_args is None: return self.skip('requires libcxx')
@@ -5108,7 +5114,7 @@ def process(filename):
     \'\'\'
       FS.createDataFile('/', 'paper.pdf', eval(Module.read('paper.pdf.js')), true, false);
       Module.callMain(Module.arguments);
-      Module.print("Data: " + JSON.stringify(FS.root.contents['filename-1.ppm'].contents.map(function(x) { return unSign(x, 8) })));
+      Module.print("Data: " + JSON.stringify(MEMFS.getFileDataAsRegularArray(FS.root.contents['filename-1.ppm']).map(function(x) { return unSign(x, 8) })));
     \'\'\'
   )
   src.close()
@@ -5158,7 +5164,7 @@ def process(filename):
     ))
   ).replace(
     '// {{POST_RUN_ADDITIONS}}',
-    "Module.print('Data: ' + JSON.stringify(FS.analyzePath('image.raw').object.contents));"
+    "Module.print('Data: ' + JSON.stringify(MEMFS.getFileDataAsRegularArray(FS.analyzePath('image.raw').object)));"
   )
   open(filename, 'w').write(src)
 '''
author	Jukka Jylänki <jujjyl@gmail.com>	2014-06-05 02:00:08 +0300
committer	Jukka Jylänki <jujjyl@gmail.com>	2014-06-05 05:54:04 +0300
commit	92cab32f5ffcc00779b521588bec62f597c98aa7 (patch)
tree	a7c248ecf7a34dac50f326e62f8c398ec07ab2c6
parent	3431eab292e0b9e2359b183fa2c954eb7e5cf7e7 (diff)