12
12
import re
13
13
import sys
14
14
15
+ from math import log
15
16
import collections
16
17
import gettext
17
18
import os .path
@@ -111,9 +112,10 @@ def __init__(self, words):
111
112
112
113
def iter_words (self , text ):
113
114
s = []
115
+ words = self .words
114
116
for m in self .pat .finditer (text ):
115
117
t = m .group (0 )
116
- if t in self . words :
118
+ if t in words :
117
119
if s :
118
120
yield (False , "" .join (s ))
119
121
s = []
@@ -124,33 +126,35 @@ def iter_words(self, text):
124
126
yield (False , "" .join (s ))
125
127
126
128
def iter (self , text ):
127
- s = []
128
129
for m in self .pat .finditer (text ):
129
130
yield m .group (0 )
130
131
131
132
def iter_substrings (s , minlen , maxlen ):
132
- maxlen = min (len (s ), maxlen )
133
+ len_s = len (s )
134
+ maxlen = min (len_s , maxlen )
133
135
for n in range (minlen , maxlen + 1 ):
134
- for begin in range (0 , len ( s ) - n + 1 ):
136
+ for begin in range (0 , len_s - n + 1 ):
135
137
yield s [begin : begin + n ]
136
138
137
139
def compute_huffman_coding (translations , compression_filename ):
138
140
texts = [t [1 ] for t in translations ]
139
- all_strings_concat = "" .join (texts )
140
141
words = []
142
+
143
+ start_unused = 0x80
144
+ end_unused = 0xff
141
145
max_ord = 0
142
- begin_unused = 128
143
- end_unused = 256
144
146
for text in texts :
145
147
for c in text :
146
148
ord_c = ord (c )
147
- max_ord = max (max_ord , ord_c )
148
- if 128 <= ord_c < 256 :
149
+ max_ord = max (ord_c , max_ord )
150
+ if 0x80 <= ord_c < 0xff :
149
151
end_unused = min (ord_c , end_unused )
150
- max_words = end_unused - begin_unused
151
- char_size = 1 if max_ord < 256 else 2
152
+ max_words = end_unused - 0x80
153
+
154
+ values_type = "uint16_t" if max_ord > 255 else "uint8_t"
155
+ max_words_len = 160 if max_ord > 255 else 255
152
156
153
- sum_word_len = 0
157
+ sum_len = 0
154
158
while True :
155
159
extractor = TextSplitter (words )
156
160
counter = collections .Counter ()
@@ -162,30 +166,30 @@ def compute_huffman_coding(translations, compression_filename):
162
166
163
167
scores = sorted (
164
168
(
165
- # I don't know why this works good. This could be better.
166
- (s , (len (s ) - 1 ) ** ((max (occ - 2 , 1 ) + 0.5 ) ** 0.8 ), occ )
169
+ (s , (len (s ) - 1 ) ** log (max (occ - 2 , 1 )), occ )
167
170
for (s , occ ) in counter .items ()
168
171
),
169
172
key = lambda x : x [1 ],
170
173
reverse = True ,
171
174
)
172
175
173
- w = None
176
+ word = None
174
177
for (s , score , occ ) in scores :
175
- if score < 0 :
176
- break
177
- if len (s ) > 1 :
178
- w = s
178
+ if occ < 5 :
179
+ continue
180
+ if score < 5 :
179
181
break
182
+ word = s
183
+ break
180
184
181
- if not w :
185
+ if not word :
182
186
break
183
- if len (w ) + sum_word_len > 256 :
187
+ if sum_len + len (word ) - 2 > max_words_len :
184
188
break
185
189
if len (words ) == max_words :
186
190
break
187
- words .append (w )
188
- sum_word_len += len (w )
191
+ words .append (word )
192
+ sum_len += len (word ) - 2
189
193
190
194
extractor = TextSplitter (words )
191
195
counter = collections .Counter ()
@@ -194,26 +198,26 @@ def compute_huffman_coding(translations, compression_filename):
194
198
counter [atom ] += 1
195
199
cb = huffman .codebook (counter .items ())
196
200
197
- word_start = begin_unused
201
+ word_start = start_unused
198
202
word_end = word_start + len (words ) - 1
199
203
print ("// # words" , len (words ))
200
204
print ("// words" , words )
201
205
202
206
values = []
203
207
length_count = {}
204
208
renumbered = 0
205
- last_l = None
209
+ last_length = None
206
210
canonical = {}
207
211
for atom , code in sorted (cb .items (), key = lambda x : (len (x [1 ]), x [0 ])):
208
212
values .append (atom )
209
- l = len (code )
210
- if l not in length_count :
211
- length_count [l ] = 0
212
- length_count [l ] += 1
213
- if last_l :
214
- renumbered <<= (l - last_l )
215
- canonical [atom ] = '{0:0{width}b}' .format (renumbered , width = l )
216
- #print(f"atom={repr(atom)} code={code}", file=sys.stderr)
213
+ length = len (code )
214
+ if length not in length_count :
215
+ length_count [length ] = 0
216
+ length_count [length ] += 1
217
+ if last_length :
218
+ renumbered <<= (length - last_length )
219
+ canonical [atom ] = '{0:0{width}b}' .format (renumbered , width = length )
220
+ # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
217
221
if len (atom ) > 1 :
218
222
o = words .index (atom ) + 0x80
219
223
s = "" .join (C_ESCAPES .get (ch1 , ch1 ) for ch1 in atom )
@@ -222,34 +226,37 @@ def compute_huffman_coding(translations, compression_filename):
222
226
o = ord (atom )
223
227
print ("//" , o , s , counter [atom ], canonical [atom ], renumbered )
224
228
renumbered += 1
225
- last_l = l
229
+ last_length = length
226
230
lengths = bytearray ()
227
231
print ("// length count" , length_count )
228
232
229
233
for i in range (1 , max (length_count ) + 2 ):
230
234
lengths .append (length_count .get (i , 0 ))
231
235
print ("// values" , values , "lengths" , len (lengths ), lengths )
232
- maxord = max (ord (u ) for u in values if len (u ) == 1 )
233
- values_type = "uint16_t" if maxord > 255 else "uint8_t"
234
- ch_size = 1 if maxord > 255 else 2
236
+
235
237
print ("//" , values , lengths )
236
238
values = [(atom if len (atom ) == 1 else chr (0x80 + words .index (atom ))) for atom in values ]
237
239
print ("//" , values , lengths )
238
- max_translation_encoded_length = max (len (translation .encode ("utf-8" )) for original ,translation in translations )
240
+ max_translation_encoded_length = max (
241
+ len (translation .encode ("utf-8" )) for (original , translation ) in translations )
242
+
243
+ wends = list (len (w ) - 2 for w in words )
244
+ for i in range (1 , len (wends )):
245
+ wends [i ] += wends [i - 1 ]
246
+
239
247
with open (compression_filename , "w" ) as f :
240
248
f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
241
249
f .write ("const {} values[] = {{ {} }};\n " .format (values_type , ", " .join (str (ord (u )) for u in values )))
242
250
f .write ("#define compress_max_length_bits ({})\n " .format (max_translation_encoded_length .bit_length ()))
243
251
f .write ("const {} words[] = {{ {} }};\n " .format (values_type , ", " .join (str (ord (c )) for w in words for c in w )))
244
- f .write ("const uint8_t wlen [] = {{ {} }};\n " .format (", " .join (str (len ( w )) for w in words )))
252
+ f .write ("const uint8_t wends [] = {{ {} }};\n " .format (", " .join (str (p ) for p in wends )))
245
253
f .write ("#define word_start {}\n " .format (word_start ))
246
254
f .write ("#define word_end {}\n " .format (word_end ))
247
255
248
- extractor = TextSplitter (words )
249
- return values , lengths , words , extractor
256
+ return (values , lengths , words , canonical , extractor )
250
257
251
258
def decompress (encoding_table , encoded , encoded_length_bits ):
252
- values , lengths , words , extractor = encoding_table
259
+ ( values , lengths , words , _ , _ ) = encoding_table
253
260
dec = []
254
261
this_byte = 0
255
262
this_bit = 7
@@ -306,66 +313,32 @@ def decompress(encoding_table, encoded, encoded_length_bits):
306
313
def compress (encoding_table , decompressed , encoded_length_bits , len_translation_encoded ):
307
314
if not isinstance (decompressed , str ):
308
315
raise TypeError ()
309
- values , lengths , words , extractor = encoding_table
316
+ ( _ , _ , _ , canonical , extractor ) = encoding_table
310
317
311
318
enc = bytearray (len (decompressed ) * 3 )
312
- #print(decompressed)
313
- #print(lengths)
314
319
current_bit = 7
315
320
current_byte = 0
316
321
317
- code = len_translation_encoded
318
- bits = encoded_length_bits + 1
322
+ bits = encoded_length_bits + 1
319
323
for i in range (bits - 1 , 0 , - 1 ):
320
324
if len_translation_encoded & (1 << (i - 1 )):
321
325
enc [current_byte ] |= 1 << current_bit
322
326
if current_bit == 0 :
323
327
current_bit = 7
324
- #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
325
328
current_byte += 1
326
329
else :
327
330
current_bit -= 1
328
331
329
- #print("values = ", values, file=sys.stderr)
330
332
for atom in extractor .iter (decompressed ):
331
- #print("", file=sys.stderr)
332
- if len (atom ) > 1 :
333
- c = chr (0x80 + words .index (atom ))
334
- else :
335
- c = atom
336
- assert c in values
337
-
338
- start = 0
339
- end = lengths [0 ]
340
- bits = 1
341
- compressed = None
342
- code = 0
343
- while compressed is None :
344
- s = start
345
- e = end
346
- #print("{0:0{width}b}".format(code, width=bits))
347
- # Linear search!
348
- for i in range (s , e ):
349
- if values [i ] == c :
350
- compressed = code + (i - start )
351
- #print("found {0:0{width}b}".format(compressed, width=bits), file=sys.stderr)
352
- break
353
- code += end - start
354
- code <<= 1
355
- start = end
356
- end += lengths [bits ]
357
- bits += 1
358
- #print("next bit", bits)
359
-
360
- for i in range (bits - 1 , 0 , - 1 ):
361
- if compressed & (1 << (i - 1 )):
333
+ for b in canonical [atom ]:
334
+ if b == "1" :
362
335
enc [current_byte ] |= 1 << current_bit
363
336
if current_bit == 0 :
364
337
current_bit = 7
365
- #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
366
338
current_byte += 1
367
339
else :
368
340
current_bit -= 1
341
+
369
342
if current_bit != 7 :
370
343
current_byte += 1
371
344
return enc [:current_byte ]
0 commit comments