Commit 70d3702

Merge pull request #4876 from juj/parallel_llvm_nm
parallel_llvm_nm
2 parents be6e66a + 8737d35

File tree

5 files changed: +179 -90 lines changed


tools/cache.py

Lines changed: 2 additions & 2 deletions

@@ -36,7 +36,7 @@ def try_remove_ending(thestring, ending):
 
   def acquire_cache_lock(self):
     if not self.EM_EXCLUSIVE_CACHE_ACCESS:
-      logging.debug('Cache: acquiring multiprocess file lock to Emscripten cache')
+      logging.debug('Cache: PID %s acquiring multiprocess file lock to Emscripten cache' % str(os.getpid()))
       try:
         self.filelock.acquire(60)
       except filelock.Timeout:
@@ -54,7 +54,7 @@ def release_cache_lock(self):
       if self.prev_EM_EXCLUSIVE_CACHE_ACCESS: os.environ['EM_EXCLUSIVE_CACHE_ACCESS'] = self.prev_EM_EXCLUSIVE_CACHE_ACCESS
       else: del os.environ['EM_EXCLUSIVE_CACHE_ACCESS']
       self.filelock.release()
-      logging.debug('Cache: released multiprocess file lock to Emscripten cache')
+      logging.debug('Cache: PID %s released multiprocess file lock to Emscripten cache' % str(os.getpid()))
 
   def ensure(self):
     self.acquire_cache_lock()
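
With a shared multiprocessing pool, several processes can now contend for the Emscripten cache at once, so the lock messages are tagged with the owning PID. A minimal standalone sketch of the locking pattern, assuming a pip-style filelock API like the one the toolchain bundles (the lock file path here is hypothetical, not part of the commit):

import logging
import os

import filelock

lock = filelock.FileLock('emscripten_cache.lock')  # hypothetical lock file path

logging.debug('Cache: PID %s acquiring multiprocess file lock' % str(os.getpid()))
try:
  lock.acquire(60)  # raise filelock.Timeout after 60s instead of hanging forever
  # ... read or write shared cache files here ...
finally:
  lock.release()  # no-op if the acquire timed out
  logging.debug('Cache: PID %s released multiprocess file lock' % str(os.getpid()))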

tools/duplicate_function_eliminator.py

Lines changed: 1 addition & 9 deletions

@@ -247,16 +247,8 @@ def write_chunk(chunk, i):
   if len(chunks) > 1 and cores >= 2:
     # We can parallelize
     if DEBUG: print >> sys.stderr, 'splitting up js optimization into %d chunks, using %d cores (total: %.2f MB)' % (len(chunks), cores, total_size/(1024*1024.))
-    pool = multiprocessing.Pool(processes=cores)
+    pool = shared.Building.get_multiprocessing_pool()
     filenames = pool.map(run_on_chunk, commands, chunksize=1)
-    try:
-      # Shut down the pool, since otherwise processes are left alive and would only be lazily terminated,
-      # and in other parts of the toolchain we also build up multiprocessing pools.
-      pool.terminate()
-      pool.join()
-    except Exception, e:
-      # On Windows we get occasional "Access is denied" errors when attempting to tear down the pool; ignore these.
-      logging.debug('Attempting to tear down multiprocessing pool failed with an exception: ' + str(e))
   else:
     # We can't parallelize, but still break into chunks to avoid uglify/node memory issues
     if len(chunks) > 1 and DEBUG: print >> sys.stderr, 'splitting up js optimization into %d chunks' % (len(chunks))
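
The deleted teardown block existed because each invocation built and destroyed its own pool; with one shared pool that lives for the whole run, both the construction cost and the Windows teardown errors disappear. A standalone sketch of the memoized-pool pattern this commit adopts (the names here are illustrative, not the toolchain's own):

import multiprocessing

_pool = None

def get_pool():
  # Build the pool once and reuse it; the repeated Pool() construction and
  # terminate()/join() teardown is what this commit eliminates.
  global _pool
  if _pool is None:
    _pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
  return _pool

def square(x):  # must be at top level so the pool can pickle it
  return x * x

if __name__ == '__main__':
  print(get_pool().map(square, range(8)))
  print(get_pool().map(square, range(8)))  # second call reuses the same workers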

tools/js_optimizer.py

Lines changed: 3 additions & 10 deletions

@@ -447,16 +447,9 @@ def write_chunk(chunk, i):
   if len(chunks) > 1 and cores >= 2:
     # We can parallelize
     if DEBUG: print >> sys.stderr, 'splitting up js optimization into %d chunks, using %d cores (total: %.2f MB)' % (len(chunks), cores, total_size/(1024*1024.))
-    pool = multiprocessing.Pool(processes=cores)
-    filenames = pool.map(run_on_chunk, commands, chunksize=1)
-    try:
-      # Shut down the pool, since otherwise processes are left alive and would only be lazily terminated,
-      # and in other parts of the toolchain we also build up multiprocessing pools.
-      pool.terminate()
-      pool.join()
-    except Exception, e:
-      # On Windows we get occasional "Access is denied" errors when attempting to tear down the pool; ignore these.
-      logging.debug('Attempting to tear down multiprocessing pool failed with an exception: ' + str(e))
+    with ToolchainProfiler.profile_block('optimizer_pool'):
+      pool = shared.Building.get_multiprocessing_pool()
+      filenames = pool.map(run_on_chunk, commands, chunksize=1)
   else:
     # We can't parallelize, but still break into chunks to avoid uglify/node memory issues
     if len(chunks) > 1 and DEBUG: print >> sys.stderr, 'splitting up js optimization into %d chunks' % (len(chunks))
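
ToolchainProfiler.profile_block wraps the pooled work in a with-block so the optimizer's pool time shows up in toolchain profiles. A minimal sketch of such a timing block (this implementation is hypothetical, not the toolchain's own):

import contextlib
import logging
import time

@contextlib.contextmanager
def profile_block(name):
  start = time.time()
  try:
    yield
  finally:
    logging.debug('block %s took %.2f seconds' % (name, time.time() - start))

with profile_block('optimizer_pool'):
  pass  # e.g. pool.map(run_on_chunk, commands, chunksize=1)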

tools/shared.py

Lines changed: 171 additions & 67 deletions

@@ -895,10 +895,10 @@ def get_vanilla_file():
       logging.debug('failed to use vanilla file, will re-check: ' + str(e))
   is_vanilla = check_vanilla()
   temp_cache = None
+  os.environ['EMCC_WASM_BACKEND'] = str(is_vanilla)
   if is_vanilla:
     logging.debug('check tells us to use wasm backend')
     LLVM_TARGET = WASM_TARGET
-    os.environ['EMCC_WASM_BACKEND'] = '1'
   else:
     logging.debug('check tells us to use asm.js backend')
     LLVM_TARGET = ASM_JS_TARGET
@@ -1168,13 +1168,101 @@ def __setattr__(self, attr, value):
 class Settings(object):
   __metaclass__ = Settings2
 
+# llvm-ar appears to just use basenames inside archives. as a result, files with the same basename
+# will trample each other when we extract them. to help warn of such situations, we warn if there
+# are duplicate entries in the archive
+def warn_if_duplicate_entries(archive_contents, archive_filename_hint=''):
+  if len(archive_contents) != len(set(archive_contents)):
+    logging.warning('loading from archive %s, which has duplicate entries (files with identical base names). this is dangerous as only the last will be taken into account, and you may see surprising undefined symbols later. you should rename source files to avoid this problem (or avoid .a archives, and just link bitcode together to form libraries for later linking)' % archive_filename_hint)
+    warned = set()
+    for i in range(len(archive_contents)):
+      curr = archive_contents[i]
+      if curr not in warned and curr in archive_contents[i+1:]:
+        logging.warning(' duplicate: %s' % curr)
+        warned.add(curr)
+
+def extract_archive_contents(f):
+  cwd = os.getcwd()
+  try:
+    temp_dir = os.path.join(tempfile.gettempdir(), f.replace('/', '_').replace('\\', '_').replace(':', '_') + '.archive_contents') # TODO: Make sure this is nice and sane
+    safe_ensure_dirs(temp_dir)
+    os.chdir(temp_dir)
+    contents = filter(lambda x: len(x) > 0, Popen([LLVM_AR, 't', f], stdout=PIPE).communicate()[0].split('\n'))
+    warn_if_duplicate_entries(contents, f)
+    if len(contents) == 0:
+      logging.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % f)
+      return {
+        'dir': temp_dir,
+        'files': []
+      }
+
+    # We are about to ask llvm-ar to extract all the files in the .a archive file, but
+    # it will silently fail if the directory for the file does not exist, so make all the necessary directories
+    for content in contents:
+      dirname = os.path.dirname(content)
+      if dirname:
+        safe_ensure_dirs(dirname)
+    Popen([LLVM_AR, 'xo', f], stdout=PIPE).communicate() # if absolute paths, files will appear there. otherwise, in this directory
+    contents = map(lambda content: os.path.join(temp_dir, content), contents)
+    contents = filter(os.path.exists, map(os.path.abspath, contents))
+    contents = filter(Building.is_bitcode, contents)
+    return {
+      'dir': temp_dir,
+      'files': contents
+    }
+  finally:
+    os.chdir(cwd)
+
+class ObjectFileInfo:
+  def __init__(self, defs, undefs, commons):
+    self.defs = defs
+    self.undefs = undefs
+    self.commons = commons
+
+# Due to a python pickling issue, the following two functions must be at top level, or multiprocessing pool spawn won't find them.
+
+def g_llvm_nm_uncached(filename):
+  return Building.llvm_nm_uncached(filename, stdout=PIPE, stderr=None)
+
+def g_multiprocessing_initializer(*args):
+  for item in args:
+    (key, value) = item.split('=')
+    os.environ[key] = value
+
 # Building
 
 class Building:
   COMPILER = CLANG
   LLVM_OPTS = False
   COMPILER_TEST_OPTS = [] # For use of the test runner
   JS_ENGINE_OVERRIDE = None # Used to pass the JS engine override from runner.py -> test_benchmark.py
+  multiprocessing_pool = None
+
+  # Multiprocessing pools are very slow to build up and tear down, and having several pools throughout
+  # the application has a problem of overallocating child processes. Therefore maintain a single
+  # centralized pool that is shared between all pooled task invocations.
+  @staticmethod
+  def get_multiprocessing_pool():
+    if not Building.multiprocessing_pool:
+      cores = int(os.environ.get('EMCC_CORES') or multiprocessing.cpu_count())
+
+      # If running with one core only, create a mock instance of a pool that does not
+      # actually spawn any new subprocesses. Very useful for internal debugging.
+      if cores == 1:
+        class FakeMultiprocessor:
+          def map(self, func, tasks):
+            results = []
+            for t in tasks:
+              results += [func(t)]
+            return results
+        Building.multiprocessing_pool = FakeMultiprocessor()
+      else:
+        child_env = [
+          'EMCC_WASM_BACKEND=' + os.environ['EMCC_WASM_BACKEND'], # so that pool children avoid calling check_vanilla() again and again
+          'EMCC_CORES=1' # pool children must not spawn their own children, which could cause a quadratic number of spawned processes
+        ]
+        Building.multiprocessing_pool = multiprocessing.Pool(processes=cores, initializer=g_multiprocessing_initializer, initargs=child_env)
+    return Building.multiprocessing_pool
 
   @staticmethod
   def get_building_env(native=False):
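
Two details above are easy to miss: pool workers re-import the module, so the mapped function and the initializer must be top-level names the pickler can find, and initargs is how each child inherits settings such as EMCC_WASM_BACKEND and EMCC_CORES=1 without recomputing them. A standalone sketch of that initializer pattern (illustrative names, not part of the commit):

import multiprocessing
import os

def initializer(*args):
  # Runs once in every child; applies 'KEY=value' strings to its environment.
  for item in args:
    key, value = item.split('=')
    os.environ[key] = value

def report(_):  # top level, so the pool can pickle it
  return os.environ.get('EMCC_CORES')

if __name__ == '__main__':
  pool = multiprocessing.Pool(processes=2, initializer=initializer, initargs=['EMCC_CORES=1'])
  print(pool.map(report, range(4)))  # every child reports '1'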
@@ -1419,6 +1507,58 @@ def open_make_err(i, mode='r'):
       os.chdir(old_dir)
     return generated_libs
 
+  @staticmethod
+  def make_paths_absolute(f):
+    if f.startswith('-'): # skip flags
+      return f
+    else:
+      return os.path.abspath(f)
+
+  # Runs llvm-nm in parallel for the given list of files, using the shared pool.
+  # The results are populated in Building.uninternal_nm_cache.
+  @staticmethod
+  def parallel_llvm_nm(files):
+    with ToolchainProfiler.profile_block('parallel_llvm_nm'):
+      pool = Building.get_multiprocessing_pool()
+      object_contents = pool.map(g_llvm_nm_uncached, files)
+
+      for i in range(len(files)):
+        Building.uninternal_nm_cache[files[i]] = object_contents[i]
+      return object_contents
+
+  @staticmethod
+  def read_link_inputs(files):
+    with ToolchainProfiler.profile_block('read_link_inputs'):
+      # Before performing the link, we need to look at each input file to determine which symbols
+      # each of them provides. Do this in multiple parallel processes.
+      archive_names = [] # .a files passed in on the command line to the link
+      object_names = [] # .o/.bc files passed in on the command line to the link
+      for f in files:
+        absolute_path_f = Building.make_paths_absolute(f)
+
+        if absolute_path_f not in Building.ar_contents and Building.is_ar(absolute_path_f):
+          archive_names.append(absolute_path_f)
+        elif absolute_path_f not in Building.uninternal_nm_cache and Building.is_bitcode(absolute_path_f):
+          object_names.append(absolute_path_f)
+
+      # Archives contain objects, so process all archives first in parallel to obtain the object files in them.
+      pool = Building.get_multiprocessing_pool()
+      object_names_in_archives = pool.map(extract_archive_contents, archive_names)
+
+      for n in range(len(archive_names)):
+        Building.ar_contents[archive_names[n]] = object_names_in_archives[n]['files']
+
+      for o in object_names_in_archives:
+        for f in o['files']:
+          if f not in Building.uninternal_nm_cache:
+            object_names.append(f)
+
+      # Next, extract symbols from all object files (either standalone or inside archives we just extracted).
+      # The results are not used here directly, but they populate the llvm-nm cache structures.
+      Building.parallel_llvm_nm(object_names)
+
   @staticmethod
   def link(files, target, force_archive_contents=False, temp_files=None, just_calculate=False):
     if not temp_files:
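
read_link_inputs leans on two llvm-ar invocations per archive, performed inside extract_archive_contents above: 't' to list members and 'xo' to extract them (o preserves timestamps), run from a per-archive temp directory so member basenames cannot collide across archives. A direct sketch of those two steps (the binary name on PATH and the directory handling are illustrative):

import os
import subprocess

def list_members(archive):
  # 'llvm-ar t' prints one member name per line.
  out = subprocess.Popen(['llvm-ar', 't', archive], stdout=subprocess.PIPE,
                         universal_newlines=True).communicate()[0]
  return [m for m in out.split('\n') if m]

def extract_members(archive, temp_dir):
  # 'llvm-ar xo' extracts members into the current directory, so run it from a
  # per-archive temp dir to keep equal basenames from different archives apart.
  cwd = os.getcwd()
  os.chdir(temp_dir)
  try:
    subprocess.Popen(['llvm-ar', 'xo', archive], stdout=subprocess.PIPE).communicate()
  finally:
    os.chdir(cwd)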
@@ -1429,19 +1569,12 @@ def link(files, target, force_archive_contents=False, temp_files=None, just_calculate=False):
     # For a simple application, this would just be "main".
     unresolved_symbols = set([func[1:] for func in Settings.EXPORTED_FUNCTIONS])
     resolved_symbols = set()
-    def make_paths_absolute(f):
-      if f.startswith('-'): # skip flags
-        return f
-      else:
-        return os.path.abspath(f)
     # Paths of already included object files from archives.
     added_contents = set()
-    # Map of archive name to list of extracted object file paths.
-    ar_contents = {}
     has_ar = False
     for f in files:
       if not f.startswith('-'):
-        has_ar = has_ar or Building.is_ar(make_paths_absolute(f))
+        has_ar = has_ar or Building.is_ar(Building.make_paths_absolute(f))
 
     # If we have only one archive or the force_archive_contents flag is set,
     # then we will add every object file we see, regardless of whether it
@@ -1467,51 +1600,14 @@ def consider_object(f, force_add=False):
         actual_files.append(f)
       return do_add
 
-    def get_archive_contents(f):
-      if f in ar_contents:
-        return ar_contents[f]
-
-      cwd = os.getcwd()
-      try:
-        temp_dir = temp_files.get_dir()
-        os.chdir(temp_dir)
-        contents = filter(lambda x: len(x) > 0, Popen([LLVM_AR, 't', f], stdout=PIPE).communicate()[0].split('\n'))
-        # llvm-ar appears to just use basenames inside archives. as a result, files with the same basename
-        # will trample each other when we extract them. to help warn of such situations, we warn if there
-        # are duplicate entries in the archive
-        if len(contents) != len(set(contents)):
-          logging.warning('loading from archive %s, which has duplicate entries (files with identical base names). this is dangerous as only the last will be taken into account, and you may see surprising undefined symbols later. you should rename source files to avoid this problem (or avoid .a archives, and just link bitcode together to form libraries for later linking)' % f)
-          warned = set()
-          for i in range(len(contents)):
-            curr = contents[i]
-            if curr not in warned and curr in contents[i+1:]:
-              logging.warning(' duplicate: %s' % curr)
-              warned.add(curr)
-        if len(contents) == 0:
-          logging.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % f)
-        else:
-          for content in contents: # ar will silently fail if the directory for the file does not exist, so make all the necessary directories
-            dirname = os.path.dirname(content)
-            if dirname:
-              safe_ensure_dirs(dirname)
-          Popen([LLVM_AR, 'x', f], stdout=PIPE).communicate() # if absolute paths, files will appear there. otherwise, in this directory
-          contents = map(lambda content: os.path.join(temp_dir, content), contents)
-          contents = filter(os.path.exists, map(os.path.abspath, contents))
-          contents = filter(Building.is_bitcode, contents)
-        ar_contents[f] = contents
-      finally:
-        os.chdir(cwd)
-
-      return contents
-
     # Traverse a single archive. The object files are repeatedly scanned for
     # newly satisfied symbols until no new symbols are found. Returns true if
     # any object files were added to the link.
     def consider_archive(f):
       added_any_objects = False
       loop_again = True
       logging.debug('considering archive %s' % (f))
-      contents = get_archive_contents(f)
+      contents = Building.ar_contents[f]
       while loop_again: # repeatedly traverse until we have everything we need
         loop_again = False
         for content in contents:
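
consider_archive's while-loop is the classic fixed-point traversal of a .a archive: pulling in one member can expose new undefined symbols that another member satisfies, so the members are rescanned until a full pass adds nothing. A compact sketch of that loop in isolation (the helper names are illustrative):

def traverse_archive(contents, added, satisfies_unresolved_symbol):
  # Repeat until a full pass over the members adds nothing new; 'added' grows
  # monotonically, so the loop always terminates.
  added_any = False
  loop_again = True
  while loop_again:
    loop_again = False
    for content in contents:
      if content in added:
        continue
      if satisfies_unresolved_symbol(content):
        added.add(content)
        loop_again = True
        added_any = True
  return added_any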
@@ -1524,9 +1620,11 @@ def consider_archive(f):
       logging.debug('done running loop of archive %s' % (f))
       return added_any_objects
 
+    Building.read_link_inputs(filter(lambda x: not x.startswith('-'), files))
+
     current_archive_group = None
     for f in files:
-      absolute_path_f = make_paths_absolute(f)
+      absolute_path_f = Building.make_paths_absolute(f)
       if f.startswith('-'):
         if f in ['--start-group', '-(']:
           assert current_archive_group is None, 'Nested --start-group, missing --end-group?'
@@ -1672,10 +1770,9 @@ def llvm_as(input_filename, output_filename=None):
 
   @staticmethod
   def parse_symbols(output, include_internal=False):
-    class ret:
-      defs = []
-      undefs = []
-      commons = []
+    defs = []
+    undefs = []
+    commons = []
     for line in output.split('\n'):
       if len(line) == 0: continue
       if ':' in line: continue # e.g. "filename.o:", saying which file it's from
@@ -1686,30 +1783,37 @@ class ret:
       if len(parts) == 2: # ignore lines with absolute offsets, these are not bitcode anyhow (e.g. |00000630 t d_source_name|)
         status, symbol = parts
         if status == 'U':
-          ret.undefs.append(symbol)
+          undefs.append(symbol)
         elif status == 'C':
-          ret.commons.append(symbol)
+          commons.append(symbol)
         elif (not include_internal and status == status.upper()) or \
              (include_internal and status in ['W', 't', 'T', 'd', 'D']): # FIXME: using WTD in the previous line fails due to llvm-nm behavior on OS X,
                                                                          # so for now we assume all uppercase are normally defined external symbols
-          ret.defs.append(symbol)
-    ret.defs = set(ret.defs)
-    ret.undefs = set(ret.undefs)
-    ret.commons = set(ret.commons)
-    return ret
+          defs.append(symbol)
+    return ObjectFileInfo(set(defs), set(undefs), set(commons))
 
-  nm_cache = {} # cache results of nm - it can be slow to run
+  internal_nm_cache = {} # cache results of nm - it can be slow to run
+  uninternal_nm_cache = {}
+  ar_contents = {} # Stores the object files contained in different archive files passed as input
 
   @staticmethod
-  def llvm_nm(filename, stdout=PIPE, stderr=None, include_internal=False):
-    if filename in Building.nm_cache:
-      #logging.debug('loading nm results for %s from cache' % filename)
-      return Building.nm_cache[filename]
-
+  def llvm_nm_uncached(filename, stdout=PIPE, stderr=None, include_internal=False):
     # LLVM binary ==> list of symbols
     output = Popen([LLVM_NM, filename], stdout=stdout, stderr=stderr).communicate()[0]
-    ret = Building.parse_symbols(output, include_internal)
-    Building.nm_cache[filename] = ret
+    return Building.parse_symbols(output, include_internal)
+
+  @staticmethod
+  def llvm_nm(filename, stdout=PIPE, stderr=None, include_internal=False):
+    if include_internal and filename in Building.internal_nm_cache:
+      return Building.internal_nm_cache[filename]
+    elif not include_internal and filename in Building.uninternal_nm_cache:
+      return Building.uninternal_nm_cache[filename]
+
+    ret = Building.llvm_nm_uncached(filename, stdout, stderr, include_internal)
+
+    if include_internal: Building.internal_nm_cache[filename] = ret
+    else: Building.uninternal_nm_cache[filename] = ret
+
     return ret
 
   @staticmethod
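
For reference, parse_symbols consumes llvm-nm's two-column lines, e.g. 'U printf' (undefined), 'C gbuf' (common), 'T main' (defined external); lines with a leading address column have three fields and are skipped. A self-contained rerun of that classification on sample output (a sketch mirroring the logic above, not the toolchain code itself):

def classify(output, include_internal=False):
  defs, undefs, commons = [], [], []
  for line in output.split('\n'):
    if len(line) == 0 or ':' in line:
      continue  # skip blanks and 'filename.o:' headers
    parts = [p for p in line.split(' ') if p]
    if len(parts) == 2:  # three-field lines carry an absolute offset; ignore them
      status, symbol = parts
      if status == 'U':
        undefs.append(symbol)
      elif status == 'C':
        commons.append(symbol)
      elif (not include_internal and status == status.upper()) or \
           (include_internal and status in ['W', 't', 'T', 'd', 'D']):
        defs.append(symbol)
  return set(defs), set(undefs), set(commons)

print(classify('foo.o:\n         U printf\nT main\nt helper'))
# -> ({'main'}, {'printf'}, set()); with include_internal=True, 'helper' lands in defs too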
