Skip to content

Commit 4068053

Browse files
committed
Implement ar-file parsing in python
This takes a different approach to the problem of duplicate files in ar archives. It makes the emar wrapper unnecessary by solving the problem at ar-file extraction time instead.
1 parent 5e6ceaa commit 4068053

File tree

4 files changed

+208
-172
lines changed

4 files changed

+208
-172
lines changed

emar.py

Lines changed: 5 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -6,98 +6,19 @@
66

77
"""Archive helper script
88
9-
This script acts as a frontend replacement for `ar`. See emcc.
10-
This is needed because, unlike a traditional linker, emscripten can't handle
11-
archive with duplicate member names. This is because emscripten extracts
12-
archive to a temporary location and duplicate filenames will clobber each
13-
other in this case.
9+
This script is a simple wrapper around llvm-ar. It used to have special
10+
handling for duplicate basenames in order to allow the bitcode linking process to
11+
read such files. This is now handled by using tools/arfile.py to read archives.
1412
"""
1513

16-
# TODO(sbc): Implement `ar x` within emscripten, in python, to avoid this issue
17-
# and delete this file.
18-
19-
from __future__ import print_function
20-
import hashlib
21-
import os
22-
import shutil
2314
import sys
2415

25-
from tools.toolchain_profiler import ToolchainProfiler
2616
from tools import shared
27-
from tools.response_file import substitute_response_files, create_response_file
28-
29-
if __name__ == '__main__':
30-
ToolchainProfiler.record_process_start()
3117

3218

33-
#
34-
# Main run() function
35-
#
3619
def run():
37-
args = substitute_response_files(sys.argv)
38-
newargs = [shared.LLVM_AR] + args[1:]
39-
40-
to_delete = []
41-
42-
# The 3 argmuent form of ar doesn't involve other files. For example
43-
# 'ar x libfoo.a'.
44-
if len(newargs) > 3:
45-
cmd = newargs[1]
46-
if 'r' in cmd:
47-
# We are adding files to the archive.
48-
# Normally the output file is then arg 2, except in the case were the
49-
# a or b modifiers are used in which case its arg 3.
50-
if 'a' in cmd or 'b' in cmd:
51-
out_arg_index = 3
52-
else:
53-
out_arg_index = 2
54-
55-
contents = set()
56-
if os.path.exists(newargs[out_arg_index]):
57-
cmd = [shared.LLVM_AR, 't', newargs[out_arg_index]]
58-
output = shared.check_call(cmd, stdout=shared.PIPE).stdout
59-
contents.update(output.split('\n'))
60-
61-
# Add a hash to colliding basename, to make them unique.
62-
for j in range(out_arg_index + 1, len(newargs)):
63-
orig_name = newargs[j]
64-
full_name = os.path.abspath(orig_name)
65-
dirname = os.path.dirname(full_name)
66-
basename = os.path.basename(full_name)
67-
if basename not in contents:
68-
contents.add(basename)
69-
continue
70-
h = hashlib.md5(full_name.encode('utf-8')).hexdigest()[:8]
71-
parts = basename.split('.')
72-
parts[0] += '_' + h
73-
newname = '.'.join(parts)
74-
full_newname = os.path.join(dirname, newname)
75-
assert not os.path.exists(full_newname)
76-
try:
77-
shutil.copyfile(orig_name, full_newname)
78-
newargs[j] = full_newname
79-
to_delete.append(full_newname)
80-
contents.add(newname)
81-
except:
82-
# it is ok to fail here, we just don't get hashing
83-
contents.add(basename)
84-
pass
85-
86-
if shared.DEBUG:
87-
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
88-
89-
response_filename = create_response_file(newargs[3:], shared.get_emscripten_temp_dir())
90-
to_delete += [response_filename]
91-
newargs = newargs[:3] + ['@' + response_filename]
92-
93-
if shared.DEBUG:
94-
print('emar:', sys.argv, ' ==> ', newargs, file=sys.stderr)
95-
96-
try:
97-
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
98-
finally:
99-
for d in to_delete:
100-
shared.try_delete(d)
20+
newargs = [shared.LLVM_AR] + sys.argv[1:]
21+
return shared.run_process(newargs, stdin=sys.stdin, check=False).returncode
10122

10223

10324
if __name__ == '__main__':

tests/test_other.py

Lines changed: 5 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,16 +1489,10 @@ def test_archive_duplicate_basenames(self):
14891489
''')
14901490
run_process([PYTHON, EMCC, os.path.join('b', 'common.c'), '-c', '-o', os.path.join('b', 'common.o')])
14911491

1492-
try_delete('liba.a')
1493-
run_process([PYTHON, EMAR, 'rc', 'liba.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
1494-
1495-
# Verify that archive contains basenames with hashes to avoid duplication
1496-
text = run_process([PYTHON, EMAR, 't', 'liba.a'], stdout=PIPE).stdout
1497-
self.assertEqual(text.count('common.o'), 1)
1498-
self.assertContained('common_', text)
1499-
for line in text.split('\n'):
1500-
# should not have huge hash names
1501-
self.assertLess(len(line), 20, line)
1492+
try_delete('libdup.a')
1493+
run_process([PYTHON, EMAR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
1494+
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
1495+
self.assertEqual(text.count('common.o'), 2)
15021496

15031497
create_test_file('main.c', r'''
15041498
void a(void);
@@ -1508,30 +1502,9 @@ def test_archive_duplicate_basenames(self):
15081502
b();
15091503
}
15101504
''')
1511-
err = run_process([PYTHON, EMCC, 'main.c', '-L.', '-la'], stderr=PIPE).stderr
1512-
self.assertNotIn('archive file contains duplicate entries', err)
1505+
run_process([PYTHON, EMCC, 'main.c', '-L.', '-ldup'])
15131506
self.assertContained('a\nb...\n', run_js('a.out.js'))
15141507

1515-
# Using llvm-ar directly should cause duplicate basenames
1516-
try_delete('libdup.a')
1517-
run_process([LLVM_AR, 'rc', 'libdup.a', os.path.join('a', 'common.o'), os.path.join('b', 'common.o')])
1518-
text = run_process([PYTHON, EMAR, 't', 'libdup.a'], stdout=PIPE).stdout
1519-
assert text.count('common.o') == 2, text
1520-
1521-
# With fastcomp we don't support duplicate members so this should generate
1522-
# a warning. With the wasm backend (lld) this is fully supported.
1523-
cmd = [PYTHON, EMCC, 'main.c', '-L.', '-ldup']
1524-
if self.is_wasm_backend():
1525-
run_process(cmd)
1526-
self.assertContained('a\nb...\n', run_js('a.out.js'))
1527-
else:
1528-
err = self.expect_fail(cmd)
1529-
self.assertIn('libdup.a: archive file contains duplicate entries', err)
1530-
self.assertIn('error: undefined symbol: a', err)
1531-
# others are not duplicates - the hashing keeps them separate
1532-
self.assertEqual(err.count('duplicate: '), 1)
1533-
self.assertContained('a\nb...\n', run_js('a.out.js'))
1534-
15351508
def test_export_from_archive(self):
15361509
export_name = 'this_is_an_entry_point'
15371510
full_export_name = '_' + export_name

tools/arfile.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Copyright 2019 The Emscripten Authors. All rights reserved.
2+
# Emscripten is available under two separate licenses, the MIT license and the
3+
# University of Illinois/NCSA Open Source License. Both these licenses can be
4+
# found in the LICENSE file.
5+
6+
"""Utility functions for parsing 'ar' files.
7+
8+
This is needed in emscripten because command line tools such as llvm-ar are not
9+
able to deal with archives containing many files with the same name. Despite
10+
this, linkers are expected to handle this case and emscripten needs to emulate
11+
linker behaviour when using the fastcomp backend.
12+
13+
See https://en.wikipedia.org/wiki/Ar_(Unix)
14+
15+
"""
16+
17+
from __future__ import print_function
18+
19+
from __builtin__ import open as builtin_open
20+
import struct
21+
import os
22+
import sys
23+
24+
# Global header that starts every ar archive.  Archives are read in binary
# mode, so this is a byte string (on Python 2 this is the same value as the
# plain string literal).
MAGIC = b'!<arch>\n'

# Each member is preceded by a fixed-size header laid out as:
# name(16) timestamp(12) owner(6) group(6) mode(8) size(10) terminator(2).
_HEADER_SIZE = 60
_HEADER_END = b'\x60\n'


def _to_str(raw):
    """Return `raw` as a native str (a no-op when bytes already is str)."""
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def _to_int(field):
    """Parse a numeric header field; a blank field is passed through as ''.

    Raises ArError when the field is present but is not a decimal number.
    """
    if not field:
        return field
    try:
        return int(field)
    except ValueError:
        raise ArError('invalid ar header')


class ArError(Exception):
    """Base exception."""
    pass


class ArInfo(object):
    """Header fields and contents of a single archive member."""

    def __init__(self, name, offset, timestamp, owner, group, mode, size, data):
        self.name = name            # member name (str)
        self.offset = offset        # file offset of the member's header
        self.timestamp = timestamp  # raw timestamp field, kept as a string
        self.owner = owner          # int, or '' when the field is blank
        self.group = group          # int, or '' when the field is blank
        # int, or '' when blank.  NOTE(review): parsed as decimal, although
        # the ar format appears to store this field as octal digits - confirm.
        self.mode = mode
        self.size = size            # length of `data` in bytes
        self.data = data            # raw member contents (bytes)


class ArFile(object):
    """Sequential reader for an ar archive.

    Members are parsed lazily by `next()`; the accessor methods read the rest
    of the archive on demand.  Unlike extracting with the `ar` tool, members
    that share a name are all kept (see `extractall`).
    """

    def __init__(self, filename):
        """Open `filename` and check the archive magic.

        Raises ArError if the file does not start with the ar magic string.
        """
        self.filename = filename
        # Binary mode: members are arbitrary bytes and must not go through
        # newline translation or text decoding.
        self._file = builtin_open(filename, 'rb')
        valid = False
        try:
            valid = self._file.read(len(MAGIC)) == MAGIC
        finally:
            # Don't leak the handle when this isn't an archive (or the read
            # itself failed).
            if not valid:
                self._file.close()
        if not valid:
            raise ArError('not an ar file: ' + filename)
        self.members = []
        self.members_map = {}
        self.offset_to_info = {}
        # Filled in when the special '//' and '/' members are encountered.
        self.name_data = b''
        self.sym_offsets = ()
        self.symbols = []

    def _read_member(self):
        """Read the member at the current file position.

        Returns an ArInfo, or None at end of file.  Raises ArError if the
        header or the member data is malformed or truncated.
        """
        offset = self._file.tell()
        header = self._file.read(_HEADER_SIZE)
        if not header:
            return None
        if len(header) != _HEADER_SIZE or header[58:] != _HEADER_END:
            raise ArError('invalid ar header')

        name = _to_str(header[0:16].strip())
        timestamp = _to_str(header[16:28].strip())
        owner = _to_int(_to_str(header[28:34].strip()))
        group = _to_int(_to_str(header[34:40].strip()))
        mode = _to_int(_to_str(header[40:48].strip()))
        size = _to_int(_to_str(header[48:58].strip()))
        if size == '' or size < 0:
            raise ArError('invalid ar header')

        data = self._file.read(size)
        if len(data) != size:
            raise ArError('truncated ar member')
        # Odd-sized members are followed by a single newline of padding.
        if size % 2 and self._file.read(1) != b'\n':
            raise ArError('invalid ar header')

        return ArInfo(name, offset, timestamp, owner, group, mode, size, data)

    def _read_symbol_table(self, data):
        """Parse the contents of the special '/' member into symbols/offsets."""
        try:
            num_entries = struct.unpack('>I', data[:4])[0]
            offsets = struct.unpack('>%dI' % num_entries,
                                    data[4:4 + 4 * num_entries])
        except struct.error:
            raise ArError('invalid symbol table')
        raw = data[4 + 4 * num_entries:-1].rstrip(b'\0')
        # Splitting an empty string would yield [''], which would wrongly
        # reject an empty symbol table.
        symbols = [_to_str(s) for s in raw.split(b'\0')] if raw else []
        if len(symbols) != num_entries:
            raise ArError('invalid symbol table')
        self.sym_offsets = offsets
        self.symbols = symbols

    def next(self):
        """Return the next regular member as an ArInfo, or None at the end.

        The special '//' (long filename table) and '/' (symbol table) members
        are consumed and recorded on the way rather than returned.
        """
        while True:
            # Keep reading entries until we find a non-special one
            info = self._read_member()
            if info is None:
                return None
            if info.name == '//':
                # Special file containing long filenames
                self.name_data = info.data
            elif info.name == '/':
                # Special file containing symbol table
                self._read_symbol_table(info.data)
            else:
                break

        # This entry has a name from the "//" name section.
        if info.name.startswith('/'):
            try:
                name_offset = int(info.name[1:])
            except ValueError:
                raise ArError('invalid extended filename section')
            if name_offset < 0 or name_offset >= len(self.name_data):
                raise ArError('invalid extended filename section')
            name_end = self.name_data.find(b'\n', name_offset)
            if name_end < 0:
                # No terminating newline: the name runs to the end of the
                # table (a -1 slice bound would drop its last character).
                name_end = len(self.name_data)
            info.name = _to_str(self.name_data[name_offset:name_end])
        info.name = info.name.rstrip('/')
        self.members.append(info)
        self.members_map[info.name] = info
        self.offset_to_info[info.offset] = info
        return info

    def getsymbols(self):
        """Return a list of (symbol, member offset) pairs from the symbol table.

        The list is empty when the archive has no symbol table.
        """
        self.getmembers()
        return list(zip(self.symbols, self.sym_offsets))

    def getmember(self, id):
        """Polymorphic member accessor that takes either an index or a name."""
        if isinstance(id, int):
            return self.getmember_by_index(id)
        return self.getmember_by_name(id)

    def getmember_by_name(self, name):
        """Return the member called `name` (the last one if names repeat).

        Raises KeyError if there is no such member.
        """
        self.getmembers()
        return self.members_map[name]

    def getmember_by_index(self, index):
        """Return the member at position `index` in archive order."""
        self.getmembers()
        return self.members[index]

    def getmembers(self):
        """Read any remaining members and return the full list of ArInfo."""
        while self.next() is not None:
            pass
        return self.members

    def list(self):
        """Write the name of every member to stdout, one per line."""
        for m in self.getmembers():
            sys.stdout.write(m.name + '\n')

    def extractall(self, path="."):
        """Write every member into the directory `path`.

        A member whose name was already written is given a numeric suffix
        ('.1', '.2', ...) so that duplicates do not clobber each other.
        Returns the list of file names written (relative to `path`), in
        archive order.
        """
        taken = set()
        written = []
        for member in self.getmembers():
            filename = member.name
            count = 1
            while filename in taken:
                filename = member.name + '.' + str(count)
                count += 1

            taken.add(filename)
            written.append(filename)
            # Binary mode so object data is written back byte-for-byte.
            with builtin_open(os.path.join(path, filename), 'wb') as f:
                f.write(member.data)

        return written

    def close(self):
        """Close the underlying archive file."""
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
167+
168+
169+
def open(filename):
    """Open the ar archive at `filename` and return an `ArFile` for it.

    Raises ArError if the file is not an ar archive.
    """
    archive = ArFile(filename)
    return archive
171+
172+
173+
def is_arfile(filename):
    """Return True if `filename` points to an ar archive that we
    are able to handle, else return False.
    """
    try:
        # Opening validates the archive magic; the handle is closed again
        # straight away on leaving the `with` block.
        with open(filename):
            return True
    except ArError:
        return False

0 commit comments

Comments
 (0)