Skip to content

Commit 9abfc51

Browse files
authored
Merge pull request #3 from ciscorn/dict-comp
Small improvements to the dictionary compression
2 parents 15964a4 + d18d79a commit 9abfc51

File tree

2 files changed

+60
-87
lines changed

2 files changed

+60
-87
lines changed

py/makeqstrdata.py

+54-81
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import re
1313
import sys
1414

15+
from math import log
1516
import collections
1617
import gettext
1718
import os.path
@@ -111,9 +112,10 @@ def __init__(self, words):
111112

112113
def iter_words(self, text):
113114
s = []
115+
words = self.words
114116
for m in self.pat.finditer(text):
115117
t = m.group(0)
116-
if t in self.words:
118+
if t in words:
117119
if s:
118120
yield (False, "".join(s))
119121
s = []
@@ -124,33 +126,35 @@ def iter_words(self, text):
124126
yield (False, "".join(s))
125127

126128
def iter(self, text):
127-
s = []
128129
for m in self.pat.finditer(text):
129130
yield m.group(0)
130131

131132
def iter_substrings(s, minlen, maxlen):
132-
maxlen = min(len(s), maxlen)
133+
len_s = len(s)
134+
maxlen = min(len_s, maxlen)
133135
for n in range(minlen, maxlen + 1):
134-
for begin in range(0, len(s) - n + 1):
136+
for begin in range(0, len_s - n + 1):
135137
yield s[begin : begin + n]
136138

137139
def compute_huffman_coding(translations, compression_filename):
138140
texts = [t[1] for t in translations]
139-
all_strings_concat = "".join(texts)
140141
words = []
142+
143+
start_unused = 0x80
144+
end_unused = 0xff
141145
max_ord = 0
142-
begin_unused = 128
143-
end_unused = 256
144146
for text in texts:
145147
for c in text:
146148
ord_c = ord(c)
147-
max_ord = max(max_ord, ord_c)
148-
if 128 <= ord_c < 256:
149+
max_ord = max(ord_c, max_ord)
150+
if 0x80 <= ord_c < 0xff:
149151
end_unused = min(ord_c, end_unused)
150-
max_words = end_unused - begin_unused
151-
char_size = 1 if max_ord < 256 else 2
152+
max_words = end_unused - 0x80
153+
154+
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
155+
max_words_len = 160 if max_ord > 255 else 255
152156

153-
sum_word_len = 0
157+
sum_len = 0
154158
while True:
155159
extractor = TextSplitter(words)
156160
counter = collections.Counter()
@@ -162,30 +166,30 @@ def compute_huffman_coding(translations, compression_filename):
162166

163167
scores = sorted(
164168
(
165-
# I don't know why this works good. This could be better.
166-
(s, (len(s) - 1) ** ((max(occ - 2, 1) + 0.5) ** 0.8), occ)
169+
(s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
167170
for (s, occ) in counter.items()
168171
),
169172
key=lambda x: x[1],
170173
reverse=True,
171174
)
172175

173-
w = None
176+
word = None
174177
for (s, score, occ) in scores:
175-
if score < 0:
176-
break
177-
if len(s) > 1:
178-
w = s
178+
if occ < 5:
179+
continue
180+
if score < 5:
179181
break
182+
word = s
183+
break
180184

181-
if not w:
185+
if not word:
182186
break
183-
if len(w) + sum_word_len > 256:
187+
if sum_len + len(word) - 2 > max_words_len:
184188
break
185189
if len(words) == max_words:
186190
break
187-
words.append(w)
188-
sum_word_len += len(w)
191+
words.append(word)
192+
sum_len += len(word) - 2
189193

190194
extractor = TextSplitter(words)
191195
counter = collections.Counter()
@@ -194,26 +198,26 @@ def compute_huffman_coding(translations, compression_filename):
194198
counter[atom] += 1
195199
cb = huffman.codebook(counter.items())
196200

197-
word_start = begin_unused
201+
word_start = start_unused
198202
word_end = word_start + len(words) - 1
199203
print("// # words", len(words))
200204
print("// words", words)
201205

202206
values = []
203207
length_count = {}
204208
renumbered = 0
205-
last_l = None
209+
last_length = None
206210
canonical = {}
207211
for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
208212
values.append(atom)
209-
l = len(code)
210-
if l not in length_count:
211-
length_count[l] = 0
212-
length_count[l] += 1
213-
if last_l:
214-
renumbered <<= (l - last_l)
215-
canonical[atom] = '{0:0{width}b}'.format(renumbered, width=l)
216-
#print(f"atom={repr(atom)} code={code}", file=sys.stderr)
213+
length = len(code)
214+
if length not in length_count:
215+
length_count[length] = 0
216+
length_count[length] += 1
217+
if last_length:
218+
renumbered <<= (length - last_length)
219+
canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
220+
# print(f"atom={repr(atom)} code={code}", file=sys.stderr)
217221
if len(atom) > 1:
218222
o = words.index(atom) + 0x80
219223
s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
@@ -222,34 +226,37 @@ def compute_huffman_coding(translations, compression_filename):
222226
o = ord(atom)
223227
print("//", o, s, counter[atom], canonical[atom], renumbered)
224228
renumbered += 1
225-
last_l = l
229+
last_length = length
226230
lengths = bytearray()
227231
print("// length count", length_count)
228232

229233
for i in range(1, max(length_count) + 2):
230234
lengths.append(length_count.get(i, 0))
231235
print("// values", values, "lengths", len(lengths), lengths)
232-
maxord = max(ord(u) for u in values if len(u) == 1)
233-
values_type = "uint16_t" if maxord > 255 else "uint8_t"
234-
ch_size = 1 if maxord > 255 else 2
236+
235237
print("//", values, lengths)
236238
values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
237239
print("//", values, lengths)
238-
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
240+
max_translation_encoded_length = max(
241+
len(translation.encode("utf-8")) for (original, translation) in translations)
242+
243+
wends = list(len(w) - 2 for w in words)
244+
for i in range(1, len(wends)):
245+
wends[i] += wends[i - 1]
246+
239247
with open(compression_filename, "w") as f:
240248
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
241249
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
242250
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
243251
f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
244-
f.write("const uint8_t wlen[] = {{ {} }};\n".format(", ".join(str(len(w)) for w in words)))
252+
f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
245253
f.write("#define word_start {}\n".format(word_start))
246254
f.write("#define word_end {}\n".format(word_end))
247255

248-
extractor = TextSplitter(words)
249-
return values, lengths, words, extractor
256+
return (values, lengths, words, canonical, extractor)
250257

251258
def decompress(encoding_table, encoded, encoded_length_bits):
252-
values, lengths, words, extractor = encoding_table
259+
(values, lengths, words, _, _) = encoding_table
253260
dec = []
254261
this_byte = 0
255262
this_bit = 7
@@ -306,66 +313,32 @@ def decompress(encoding_table, encoded, encoded_length_bits):
306313
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
307314
if not isinstance(decompressed, str):
308315
raise TypeError()
309-
values, lengths, words, extractor = encoding_table
316+
(_, _, _, canonical, extractor) = encoding_table
310317

311318
enc = bytearray(len(decompressed) * 3)
312-
#print(decompressed)
313-
#print(lengths)
314319
current_bit = 7
315320
current_byte = 0
316321

317-
code = len_translation_encoded
318-
bits = encoded_length_bits+1
322+
bits = encoded_length_bits + 1
319323
for i in range(bits - 1, 0, -1):
320324
if len_translation_encoded & (1 << (i - 1)):
321325
enc[current_byte] |= 1 << current_bit
322326
if current_bit == 0:
323327
current_bit = 7
324-
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
325328
current_byte += 1
326329
else:
327330
current_bit -= 1
328331

329-
#print("values = ", values, file=sys.stderr)
330332
for atom in extractor.iter(decompressed):
331-
#print("", file=sys.stderr)
332-
if len(atom) > 1:
333-
c = chr(0x80 + words.index(atom))
334-
else:
335-
c = atom
336-
assert c in values
337-
338-
start = 0
339-
end = lengths[0]
340-
bits = 1
341-
compressed = None
342-
code = 0
343-
while compressed is None:
344-
s = start
345-
e = end
346-
#print("{0:0{width}b}".format(code, width=bits))
347-
# Linear search!
348-
for i in range(s, e):
349-
if values[i] == c:
350-
compressed = code + (i - start)
351-
#print("found {0:0{width}b}".format(compressed, width=bits), file=sys.stderr)
352-
break
353-
code += end - start
354-
code <<= 1
355-
start = end
356-
end += lengths[bits]
357-
bits += 1
358-
#print("next bit", bits)
359-
360-
for i in range(bits - 1, 0, -1):
361-
if compressed & (1 << (i - 1)):
333+
for b in canonical[atom]:
334+
if b == "1":
362335
enc[current_byte] |= 1 << current_bit
363336
if current_bit == 0:
364337
current_bit = 7
365-
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
366338
current_byte += 1
367339
else:
368340
current_bit -= 1
341+
369342
if current_bit != 7:
370343
current_byte += 1
371344
return enc[:current_byte]

supervisor/shared/translate.c

+6-6
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,17 @@ STATIC int put_utf8(char *buf, int u) {
4848
*buf = u;
4949
return 1;
5050
} else if(word_start <= u && u <= word_end) {
51-
int n = (u - 0x80);
52-
size_t off = 0;
53-
for(int i=0; i<n; i++) {
54-
off += wlen[i];
51+
uint n = (u - word_start);
52+
size_t pos = 0;
53+
if (n > 0) {
54+
pos = wends[n - 1] + (n * 2);
5555
}
5656
int ret = 0;
5757
// note that at present, entries in the words table are
5858
// guaranteed not to represent words themselves, so this adds
5959
// at most 1 level of recursive call
60-
for(int i=0; i<wlen[n]; i++) {
61-
int len = put_utf8(buf, words[off+i]);
60+
for(; pos < wends[n] + (n + 1) * 2; pos++) {
61+
int len = put_utf8(buf, words[pos]);
6262
buf += len;
6363
ret += len;
6464
}

0 commit comments

Comments
 (0)