aboutsummaryrefslogtreecommitdiff
path: root/tools/shared.py
diff options
context:
space:
mode:
authorAlon Zakai <alonzakai@gmail.com>2012-11-20 20:50:20 +0100
committerAlon Zakai <alonzakai@gmail.com>2012-11-21 20:47:02 +0100
commit60a11cfaec3ad560cd7fcedf618001bd9c756b1c (patch)
treebd52dfb7481ebe89bb65b7f2a3bed579168d85a4 /tools/shared.py
parenta2b241e70b7f7606c913e916cffb69514d548ade (diff)
improve chunkify to consider previous chunkings when caching
Diffstat (limited to 'tools/shared.py')
-rw-r--r--tools/shared.py54
1 file changed, 51 insertions, 3 deletions
diff --git a/tools/shared.py b/tools/shared.py
index 02d92855..e8767e40 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -1208,10 +1208,50 @@ class JCache:
# Given a set of functions of form (ident, text), and a preferred chunk size,
# generates a set of chunks for parallel processing and caching.
# It is very important to generate similar chunks in incremental builds, in
- # order to maximize the chance of cache hits.
+ # order to maximize the chance of cache hits. To achieve that, we save the
+ # chunking used in the previous compilation of this phase, and we try to
+ # generate the same chunks, barring big differences in function sizes that
+ # violate our chunk size guideline. If caching is not used, chunking_file
+ # should be None
@staticmethod
- def chunkify(funcs, chunk_size):
- chunks = [] # bundles of functions
+ def chunkify(funcs, chunk_size, chunking_file):
+ previous_mapping = None
+ if chunking_file:
+ chunking_file = JCache.get_cachename(chunking_file)
+ if os.path.exists(chunking_file):
+ try:
+ previous_mapping = cPickle.Unpickler(open(chunking_file, 'rb')).load() # maps a function identifier to the chunk number it will be in
+ except:
+ pass
+ chunks = []
+ if previous_mapping:
+ # initialize with previous chunking
+ news = []
+ for func in funcs:
+ ident, data = func
+ if not ident in previous_mapping:
+ news.append(func)
+ else:
+ n = previous_mapping[ident]
+ while n > len(chunks): chunks.append([])
+ chunks[n].append(func)
+ # add news and adjust for new sizes
+ spilled = news
+ for chunk in chunks:
+ size = sum([len(func[1]) for func in chunk])
+ while size > 1.5*chunk_size and len(chunk) > 0:
+ spill = chunk.pop()
+ spilled.append(spill)
+ size -= len(spill[1])
+ for chunk in chunks:
+ size = sum([len(func[1]) for func in chunk])
+ while size < 0.66*chunk_size and len(spilled) > 0:
+ spill = spilled.pop()
+ chunk.append(spill)
+ size += len(spill[1])
+ chunks = filter(lambda chunk: len(chunk) > 0, chunks) # might have empty ones, eliminate them
+ funcs = spilled # we will allocate these into chunks as if they were normal inputs
+ # initialize reasonably, the rest of the funcs we need to split out
curr = []
for i in range(len(funcs)):
func = funcs[i]
@@ -1223,6 +1263,14 @@ class JCache:
if curr:
chunks.append(curr)
curr = None
+ if chunking_file:
+ # save new mapping info
+ new_mapping = {}
+ for i in range(len(chunks)):
+ chunk = chunks[i]
+ for ident, data in chunk:
+ new_mapping[ident] = i
+ cPickle.Pickler(open(chunking_file, 'wb')).dump(new_mapping)
return chunks
class JS: