diff --git a/emar.py b/emar.py
index e45852361333d..5f8db6316b65c 100755
--- a/emar.py
+++ b/emar.py
@@ -6,98 +6,19 @@
 """Archive helper script
 
-This script acts as a frontend replacement for `ar`. See emcc.
-This is needed because, unlike a traditional linker, emscripten can't handle
-archive with duplicate member names. This is because emscripten extracts
-archive to a temporary location and duplicate filenames will clobber each
-other in this case.
+This script is a simple wrapper around llvm-ar. It used to have special
+handling for duplicate basenames in order to allow the bitcode linking
+process to read such files; that is now handled by using tools/arfile.py
+to read archives.
 """
 
-# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
-# and delete this file.
-
-from __future__ import print_function
-import hashlib
-import os
-import shutil
 import sys
 
-from tools.toolchain_profiler import ToolchainProfiler
 from tools import shared
-from tools.response_file import substitute_response_files, create_response_file
-
-if __name__ == '__main__':
-  ToolchainProfiler.record_process_start()
 
-#
-# Main run() function
-#
 def run():
-  args = substitute_response_files(sys.argv)
-  newargs = [shared.LLVM_AR] + args[1:]
-
-  to_delete = []
-
-  # The 3 argmuent form of ar doesn't involve other files. For example
-  # 'ar x libfoo.a'.
-  if len(newargs) > 3:
-    cmd = newargs[1]
-    if 'r' in cmd:
-      # We are adding files to the archive.
-      # Normally the output file is then arg 2, except in the case were the
-      # a or b modifiers are used in which case its arg 3.
-      if 'a' in cmd or 'b' in cmd:
-        out_arg_index = 3
-      else:
-        out_arg_index = 2
-
-      contents = set()
-      if os.path.exists(newargs[out_arg_index]):
-        cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
-        output = shared.check_call(cmd, stdout=shared.PIPE).stdout
-        contents.update(output.split('\n'))
-
-      # Add a hash to colliding basename, to make them unique.
-      for j in range(out_arg_index + 1, len(newargs)):
-        orig_name = newargs[j]
-        full_name = os.path.abspath(orig_name)
-        dirname = os.path.dirname(full_name)
-        basename = os.path.basename(full_name)
-        if basename not in contents:
-          contents.add(basename)
-          continue
-        h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
-        parts = basename.split('.')
-        parts[0] += '_' + h
-        newname = '.'.join(parts)
-        full_newname = os.path.join(dirname, newname)
-        assert not os.path.exists(full_newname)
-        try:
-          shutil.copyfile(orig_name, full_newname)
-          newargs[j] = full_newname
-          to_delete.append(full_newname)
-          contents.add(newname)
-        except:
-          # it is ok to fail here, we just don't get hashing
-          contents.add(basename)
-          pass
-
-  if shared.DEBUG:
-    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
-
-  response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
-  to_delete += [response_filename]
-  newargs = newargs[:3] + ['@' + response_filename]
-
-  if shared.DEBUG:
-    print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
-
-  try:
-    return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
-  finally:
-    for d in to_delete:
-      shared.try_delete(d)
+  newargs = [shared.LLVM_AR] + sys.argv[1:]
+  return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
 
 
 if __name__ == '__main__':
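
For reference, the surviving wrapper is now just a pass-through. A minimal standalone sketch of the equivalent behaviour (assuming `llvm-ar` is on PATH rather than resolved via `shared.LLVM_AR`):

```python
#!/usr/bin/env python
# Minimal pass-through sketch: hand every argument to llvm-ar unchanged
# and propagate its exit code (llvm-ar is assumed to be on PATH).
import subprocess
import sys

sys.exit(subprocess.call(['llvm-ar'] + sys.argv[1:]))
```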
diff --git a/tests/test_core.py b/tests/test_core.py
index 67eaf7d138e7b..33d08410e70bc 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -8,7 +8,6 @@
 import hashlib
 import json
 import os
-import random
 import re
 import shutil
 import sys
@@ -5164,11 +5163,11 @@ def test_iostream_and_determinism(self):
       return 0;
     }
     '''
-    num = 5
+    num = 3
 
     def test():
       print('(iteration)')
-      time.sleep(random.random() / (10 * num)) # add some timing nondeterminism here, not that we need it, but whatever
+      time.sleep(1.0)
       self.do_run(src, 'hello world\n77.\n')
       ret = open('src.cpp.o.js', 'rb').read()
       if self.get_setting('WASM') and not self.get_setting('WASM2JS'):
@@ -5176,14 +5175,13 @@ def test():
       return ret
 
     builds = [test() for i in range(num)]
-    print(list(map(len, builds)))
+    print([len(b) for b in builds])
     uniques = set(builds)
     if len(uniques) != 1:
-      i = 0
-      for unique in uniques:
+      for i, unique in enumerate(uniques):
         open('unique_' + str(i) + '.js', 'wb').write(unique)
-        i += 1
-      assert 0, 'builds must be deterministic, see unique_X.js'
+      # builds must be deterministic, see unique_N.js
+      self.assertEqual(len(uniques), 1)
 
   def test_stdvec(self):
     self.do_run_in_out_file_test('tests', 'core', 'test_stdvec')
diff --git a/tests/test_other.py b/tests/test_other.py
index 69304acf4d1db..9b5e879584136 100644
--- a/tests/test_other.py
+++ b/tests/test_other.py
@@ -1450,16 +1450,10 @@ def test_archive_duplicate_basenames(self):
     ''')
     run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])
 
-    try_delete('liba.a')
-    run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
-
-    # Verify that archive contains basenames with hashes to avoid duplication
-    text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
-    self.assertEqual(text.count('common.o'), 1)
-    self.assertContained('common_', text)
-    for line in text.split('\n'):
-      # should not have huge hash names
-      self.assertLess(len(line), 20, line)
+    try_delete('libdup.a')
+    run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
+    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
+    self.assertEqual(text.count('common.o'), 2)
 
     create_test_file('main.c', r'''
       void a(void);
@@ -1469,30 +1463,9 @@ def test_archive_duplicate_basenames(self):
         b();
       }
     ''')
-    err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
-    self.assertNotIn('archive file contains duplicate entries', err)
+    run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
     self.assertContained('a\nb...\n', run_js('a.out.js'))
 
-    # Using llvm-ar directly should cause duplicate basenames
-    try_delete('libdup.a')
-    run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
-    text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
-    assert text.count('common.o') == 2, text
-
-    # With fastcomp we don't support duplicate members so this should generate
-    # a warning. With the wasm backend (lld) this is fully supported.
-    cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
-    if self.is_wasm_backend():
-      run_process(cmd)
-      self.assertContained('a\nb...\n', run_js('a.out.js'))
-    else:
-      err = self.expect_fail(cmd)
-      self.assertIn('libdup.a: archive file contains duplicate entries', err)
-      self.assertIn('error: undefined symbol: a', err)
-      # others are not duplicates - the hashing keeps them separate
-      self.assertEqual(err.count('duplicate: '), 1)
-      self.assertContained('a\nb...\n', run_js('a.out.js'))
-
   def test_export_from_archive(self):
     export_name = 'this_is_an_entry_point'
     full_export_name = '_' + export_name
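
The rewritten test leans on llvm-ar happily storing two members with the same basename. A quick standalone reproduction of what the test now asserts (hypothetical paths; assumes `emar` is on PATH and `a/common.o` and `b/common.o` already exist):

```python
# Hypothetical repro: build an archive from two objects that share a
# basename, then confirm that 'emar t' lists the name twice.
import subprocess

subprocess.check_call(['emar', 'rc', 'libdup.a', 'a/common.o', 'b/common.o'])
listing = subprocess.check_output(['emar', 't', 'libdup.a']).decode()
assert listing.count('common.o') == 2, listing
```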
diff --git a/tools/arfile.py b/tools/arfile.py
new file mode 100755
index 0000000000000..ce41e3e66f391
--- /dev/null
+++ b/tools/arfile.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# Copyright 2019 The Emscripten Authors. All rights reserved.
+# Emscripten is available under two separate licenses, the MIT license and the
+# University of Illinois/NCSA Open Source License. Both these licenses can be
+# found in the LICENSE file.
+
+"""Utility functions for parsing 'ar' files.
+
+This is needed in emscripten because command line tools such as llvm-ar are not
+able to deal with archives containing many files with the same name. Despite
+this, linkers are expected to handle this case and emscripten needs to emulate
+linker behaviour when using the fastcomp backend.
+
+See https://en.wikipedia.org/wiki/Ar_(Unix)
+"""
+
+from __future__ import print_function
+
+import struct
+import os
+import sys
+
+MAGIC = b'!<arch>\n'
+builtin_open = open
+
+
+class ArError(Exception):
+  """Base exception."""
+  pass
+
+
+class ArInfo(object):
+  def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
+    self.name = name
+    self.offset = offset
+    self.timestamp = timestamp
+    self.owner = owner
+    self.group = group
+    self.mode = mode
+    self.size = size
+    self.data = data
+
+
+class ArFile(object):
+  def __init__(self, filename):
+    self.filename = filename
+    self._file = builtin_open(filename, 'rb')
+    magic = self._file.read(len(MAGIC))
+    if MAGIC != magic:
+      raise ArError('not an ar file: ' + filename)
+    self.members = []
+    self.members_map = {}
+    self.offset_to_info = {}
+
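+  # Each member is preceded by a 60-byte ASCII header whose fields are, in
+  # order: name (16 bytes), timestamp (12), owner (6), group (6), mode (8)
+  # and size (10), followed by the 2-byte terminator b'\x60\n'. Member data
+  # is padded with b'\n' to the next even offset.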
+  def _read_member(self):
+    offset = self._file.tell()
+    name = self._file.read(16)
+    if len(name) == 0:
+      return None
+    name = name.strip()
+    timestamp = self._file.read(12).strip()
+    owner = self._file.read(6).strip()
+    group = self._file.read(6).strip()
+    mode = self._file.read(8).strip()
+    size = int(self._file.read(10))
+    ending = self._file.read(2)
+    if ending != b'\x60\n':
+      raise ArError('invalid ar header')
+    data = self._file.read(size)
+    if mode.strip():
+      mode = int(mode)
+    if owner.strip():
+      owner = int(owner)
+    if group.strip():
+      group = int(group)
+    if size % 2:
+      if self._file.read(1) != b'\n':
+        raise ArError('invalid ar header')
+
+    return ArInfo(name.decode('utf-8'), offset, timestamp, owner, group, mode, size, data)
+
+  def next(self):
+    while True:
+      # Keep reading entries until we find a non-special one
+      info = self._read_member()
+      if not info:
+        return None
+      if info.name == '//':
+        # Special file containing long filenames
+        self.name_data = info.data
+      elif info.name == '/':
+        # Special file containing symbol table
+        num_entries = struct.unpack('>I', info.data[:4])[0]
+        self.sym_offsets = struct.unpack('>%dI' % num_entries, info.data[4:4 + 4 * num_entries])
+        symbol_data = info.data[4 + 4 * num_entries:-1]
+        symbol_data = symbol_data.rstrip(b'\0')
+        if symbol_data:
+          self.symbols = symbol_data.split(b'\0')
+        else:
+          self.symbols = []
+        if len(self.symbols) != num_entries:
+          raise ArError('invalid symbol table')
+      else:
+        break
+
+    # A name of the form "/<offset>" refers into the "//" long-name section.
+    if info.name[0] == '/':
+      name_offset = int(info.name[1:])
+      if name_offset < 0 or name_offset >= len(self.name_data):
+        raise ArError('invalid extended filename section')
+      name_end = self.name_data.find(b'\n', name_offset)
+      info.name = self.name_data[name_offset:name_end].decode('utf-8')
+    # GNU ar terminates member names with a trailing '/'
+    info.name = info.name.rstrip('/')
+    self.members.append(info)
+    self.members_map[info.name] = info
+    self.offset_to_info[info.offset] = info
+    return info
+
+  def getsymbols(self):
+    return zip(self.symbols, self.sym_offsets)
+
+  def getmember(self, id):
+    """Polymorphic member accessor that takes either an index or a name."""
+    if isinstance(id, int):
+      return self.getmember_by_index(id)
+    return self.getmember_by_name(id)
+
+  def getmember_by_name(self, name):
+    self.getmembers()
+    return self.members_map[name]
+
+  def getmember_by_index(self, index):
+    self.getmembers()
+    return self.members[index]
+
+  def getmembers(self):
+    while self.next():
+      pass
+    return self.members
+
+  def list(self):
+    for m in self.getmembers():
+      sys.stdout.write(m.name + '\n')
+
+  def extractall(self, path="."):
+    names_written = set()
+    for m in self.getmembers():
+      filename = m.name
+      if filename in names_written:
+        basename = filename
+        count = 1
+        while filename in names_written:
+          filename = basename + '.' + str(count)
+          count += 1
+
+      names_written.add(filename)
+      full_name = os.path.join(path, filename)
+      with builtin_open(full_name, 'wb') as f:
+        f.write(m.data)
+
+    return sorted(list(names_written))
+
+  def close(self):
+    self._file.close()
+
+  def __enter__(self):
+    return self
+
+  def __exit__(self, type, value, traceback):
+    self.close()
+
+
+def open(filename):
+  return ArFile(filename)
+
+
+def is_arfile(filename):
+  """Return True if filename points to an ar archive that we
+  are able to handle, else return False.
+  """
+  try:
+    t = open(filename)
+    t.close()
+    return True
+  except ArError:
+    return False
+
+
+if __name__ == '__main__':
+  open(sys.argv[1]).list()
+  open(sys.argv[1]).extractall()
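
A short usage sketch of the new module (hypothetical archive path): extractall() keeps colliding basenames apart by suffixing later duplicates with '.1', '.2', and so on:

```python
# Sketch: extract an archive whose members collide on basename
# (hypothetical libdup.a); both copies of common.o survive.
import tempfile

from tools import arfile

tmpdir = tempfile.mkdtemp()
with arfile.open('libdup.a') as f:
  print(f.extractall(tmpdir))  # e.g. ['common.o', 'common.o.1']
```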
diff --git a/tools/shared.py b/tools/shared.py
index 666f8176ef1dc..d518e815557c6 100644
--- a/tools/shared.py
+++ b/tools/shared.py
@@ -6,7 +6,7 @@
 from __future__ import print_function
 
 from distutils.spawn import find_executable
-from subprocess import PIPE, STDOUT
+from subprocess import PIPE, STDOUT  # noqa
 import atexit
 import base64
 import difflib
@@ -32,6 +32,7 @@
 from .toolchain_profiler import ToolchainProfiler
 from .tempfiles import try_delete
+from . import arfile
 from . import jsrun, cache, tempfiles, colored_logger
 from . import response_file
 
@@ -1301,60 +1302,22 @@ def verify_settings():
 
 verify_settings()
 
-# llvm-ar appears to just use basenames inside archives. as a result, files with the same basename
-# will trample each other when we extract them. to help warn of such situations, we warn if there
-# are duplicate entries in the archive
-def warn_if_duplicate_entries(archive_contents, archive_filename):
-  if len(archive_contents) != len(set(archive_contents)):
-    logger.warning('%s: archive file contains duplicate entries. This is not supported by emscripten. Only the last member with a given name will be linked in which can result in undefined symbols. You should either rename your source files, or use `emar` to create you archives which works around this issue.' % archive_filename)
-    warned = set()
-    for i in range(len(archive_contents)):
-      curr = archive_contents[i]
-      if curr not in warned and curr in archive_contents[i + 1:]:
-        logger.warning('   duplicate: %s' % curr)
-        warned.add(curr)
-
-
-# This function creates a temporary directory specified by the 'dir' field in
-# the returned dictionary. Caller is responsible for cleaning up those files
-# after done.
+# This function extracts the archive contents into a freshly created temporary
+# directory (the third element of the returned tuple). The caller is
+# responsible for cleaning up that directory when done with the files.
 def extract_archive_contents(archive_file):
-  lines = run_process([LLVM_AR, 't', archive_file], stdout=PIPE).stdout.splitlines()
-  # ignore empty lines
-  contents = [l for l in lines if len(l)]
-  if len(contents) == 0:
-    logger.debug('Archive %s appears to be empty (recommendation: link an .so instead of .a)' % archive_file)
-    return {
-      'returncode': 0,
-      'dir': None,
-      'files': []
-    }
-
-  # `ar` files can only contains filenames. Just to be sure, verify that each
-  # file has only as filename component and is not absolute
-  for f in contents:
-    assert not os.path.dirname(f)
-    assert not os.path.isabs(f)
-
-  warn_if_duplicate_entries(contents, archive_file)
-
   # create temp dir
   temp_dir = tempfile.mkdtemp('_archive_contents', 'emscripten_temp_')
 
-  # extract file in temp dir
-  proc = run_process([LLVM_AR, 'xo', archive_file], stdout=PIPE, stderr=STDOUT, cwd=temp_dir)
-  abs_contents = [os.path.join(temp_dir, c) for c in contents]
-
-  # check that all files were created
-  missing_contents = [x for x in abs_contents if not os.path.exists(x)]
-  if missing_contents:
-    exit_with_error('llvm-ar failed to extract file(s) ' + str(missing_contents) + ' from archive file ' + f + '! Error:' + str(proc.stdout))
+  try:
+    with arfile.open(archive_file) as f:
+      contents = f.extractall(temp_dir)
+  except arfile.ArError as e:
+    logger.error(str(e))
+    return archive_file, [], temp_dir, False
 
-  return {
-    'returncode': proc.returncode,
-    'dir': temp_dir,
-    'files': abs_contents
-  }
+  abs_contents = [os.path.join(temp_dir, c) for c in contents]
+  return archive_file, abs_contents, temp_dir, True
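
extract_archive_contents() now returns a flat (archive, files, tmpdir, success) tuple rather than a dict. A sketch of consuming it directly (hypothetical archive path; the caller owns the temporary directory):

```python
# Sketch of the new return contract (hypothetical libfoo.a). The caller
# owns tmpdir and must delete it once done with the extracted files.
from tools import shared
from tools.tempfiles import try_delete

name, files, tmpdir, success = shared.extract_archive_contents('libfoo.a')
try:
  if not success:
    raise RuntimeError('failed to extract archive: ' + name)
  for f in files:
    print(f)  # absolute paths of the extracted members
finally:
  try_delete(tmpdir)
```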
@@ -1757,20 +1720,18 @@ def read_link_inputs(files):
       pool = Building.get_multiprocessing_pool()
       object_names_in_archives = pool.map(extract_archive_contents, archive_names)
 
-      def clean_temporary_archive_contents_directory(directory):
+      def clean_temporary_directory(directory):
         def clean_at_exit():
           try_delete(directory)
         if directory:
           atexit.register(clean_at_exit)
 
-      for n in range(len(archive_names)):
-        if object_names_in_archives[n]['returncode'] != 0:
-          raise Exception('llvm-ar failed on archive ' + archive_names[n] + '!')
-        Building.ar_contents[archive_names[n]] = object_names_in_archives[n]['files']
-        clean_temporary_archive_contents_directory(object_names_in_archives[n]['dir'])
-
-      for o in object_names_in_archives:
-        for f in o['files']:
+      for name, files, tmpdir, success in object_names_in_archives:
+        if not success:
+          exit_with_error('failed to extract archive: ' + name)
+        Building.ar_contents[name] = files
+        clean_temporary_directory(tmpdir)
+        for f in files:
           if f not in Building.uninternal_nm_cache:
             object_names.append(f)
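
One part of the new arfile API that the linking code above does not exercise is the GNU symbol index. A sketch of mapping each indexed symbol back to its defining member (hypothetical libdup.a; assumes the archive carries a '/' symbol-table member):

```python
# Sketch: map each symbol in the archive's index to the member defining
# it (hypothetical libdup.a; assumes a '/' symbol-table member exists).
from tools import arfile

with arfile.open('libdup.a') as f:
  f.getmembers()  # parse all members, populating the symbol table
  for sym, offset in f.getsymbols():
    print(sym.decode('utf-8'), '->', f.offset_to_info[offset].name)
```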