Skip to content

More efficient translation #2968

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) {
}
mp_hal_stdout_tx_str(filename);
const compressed_string_t* compressed = translate(" output:\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_hal_stdout_tx_str(decompressed);
pyexec_file(filename, exec_result);
Expand Down
4 changes: 2 additions & 2 deletions py/builtinhelp.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) {

// let the user know there may be other modules available from the filesystem
const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_print_str(MP_PYTHON_PRINTER, decompressed);
}
Expand Down Expand Up @@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) {
// print a general help message. Translate only works on single strings on one line.
const compressed_string_t* compressed =
translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n");
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG);
} else {
Expand Down
55 changes: 47 additions & 8 deletions py/makeqstrdata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""
Process raw qstr file and output qstr data with length, hash and data bytes.

This script works with Python 2.6, 2.7, 3.3 and 3.4.
This script works with Python 2.7, 3.3 and 3.4.

For documentation about the format of compressed translated strings, see
supervisor/shared/translate.h
"""

from __future__ import print_function
Expand Down Expand Up @@ -132,19 +135,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
print("//", values, lengths)
values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
with open(compression_filename, "w") as f:
f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
return values, lengths

def decompress(encoding_table, length, encoded):
def decompress(encoding_table, encoded, encoded_length_bits):
values, lengths = encoding_table
#print(l, encoded)
dec = []
this_byte = 0
this_bit = 7
b = encoded[this_byte]
for i in range(length):
bits = 0
for i in range(encoded_length_bits):
bits <<= 1
if 0x80 & b:
bits |= 1

b <<= 1
if this_bit == 0:
this_bit = 7
this_byte += 1
if this_byte < len(encoded):
b = encoded[this_byte]
else:
this_bit -= 1
length = bits

i = 0
while i < length:
bits = 0
bit_length = 0
max_code = lengths[0]
Expand All @@ -170,10 +191,11 @@ def decompress(encoding_table, length, encoded):
searched_length += lengths[bit_length]

v = values[searched_length + bits - max_code]
i += len(v.encode('utf-8'))
dec.append(v)
return ''.join(dec)

def compress(encoding_table, decompressed):
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
if not isinstance(decompressed, str):
raise TypeError()
values, lengths = encoding_table
Expand All @@ -182,6 +204,19 @@ def compress(encoding_table, decompressed):
#print(lengths)
current_bit = 7
current_byte = 0

code = len_translation_encoded
bits = encoded_length_bits+1
for i in range(bits - 1, 0, -1):
if len_translation_encoded & (1 << (i - 1)):
enc[current_byte] |= 1 << current_bit
if current_bit == 0:
current_bit = 7
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
current_byte += 1
else:
current_bit -= 1

for c in decompressed:
#print()
#print("char", c, values.index(c))
Expand Down Expand Up @@ -342,14 +377,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):

total_text_size = 0
total_text_compressed_size = 0
max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns)
encoded_length_bits = max_translation_encoded_length.bit_length()
for original, translation in i18ns:
translation_encoded = translation.encode("utf-8")
compressed = compress(encoding_table, translation)
compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded))
total_text_compressed_size += len(compressed)
decompressed = decompress(encoding_table, len(translation_encoded), compressed)
decompressed = decompress(encoding_table, compressed, encoded_length_bits)
assert decompressed == translation
for c in C_ESCAPES:
decompressed = decompressed.replace(c, C_ESCAPES[c])
print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed))
total_text_size += len(translation.encode("utf-8"))

print()
Expand Down Expand Up @@ -385,6 +423,7 @@ def print_qstr_enums(qstrs):

qcfgs, qstrs, i18ns = parse_input_headers(args.infiles)
if args.translation:
i18ns = sorted(i18ns)
translations = translate(args.translation, i18ns)
encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
print_qstr_data(encoding_table, qcfgs, qstrs, translations)
Expand Down
2 changes: 1 addition & 1 deletion py/moduerrno.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) {
case ENOSPC: desc = translate("No space left on device"); break;
case EROFS: desc = translate("Read-only filesystem"); break;
}
if (desc != NULL && desc->length <= len) {
if (desc != NULL && decompress_length(desc) <= len) {
decompress(desc, buf);
return buf;
}
Expand Down
6 changes: 3 additions & 3 deletions py/obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) {
assert(n % 3 == 0);
// Decompress the format strings
const compressed_string_t* traceback = translate("Traceback (most recent call last):\n");
char decompressed[traceback->length];
char decompressed[decompress_length(traceback)];
decompress(traceback, decompressed);
#if MICROPY_ENABLE_SOURCE_LINE
const compressed_string_t* frame = translate(" File \"%q\", line %d");
#else
const compressed_string_t* frame = translate(" File \"%q\"");
#endif
char decompressed_frame[frame->length];
char decompressed_frame[decompress_length(frame)];
decompress(frame, decompressed_frame);
const compressed_string_t* block_fmt = translate(", in %q\n");
char decompressed_block[block_fmt->length];
char decompressed_block[decompress_length(block_fmt)];
decompress(block_fmt, decompressed_block);

// Print the traceback
Expand Down
4 changes: 2 additions & 2 deletions py/objexcept.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com

// Try to allocate memory for the message
mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t);
size_t o_str_alloc = fmt->length + 1;
size_t o_str_alloc = decompress_length(fmt);
byte *o_str_buf = m_new_maybe(byte, o_str_alloc);

bool used_emg_buf = false;
Expand Down Expand Up @@ -433,7 +433,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
// We have some memory to format the string
struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf};
mp_print_t print = {&exc_pr, exc_add_strn};
char fmt_decompressed[fmt->length];
char fmt_decompressed[decompress_length(fmt)];
decompress(fmt, fmt_decompressed);
mp_vprintf(&print, fmt_decompressed, ap);
exc_pr.buf[exc_pr.len] = '\0';
Expand Down
26 changes: 18 additions & 8 deletions supervisor/shared/translate.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
#include "supervisor/serial.h"

void serial_write_compressed(const compressed_string_t* compressed) {
char decompressed[compressed->length];
char decompressed[decompress_length(compressed)];
decompress(compressed, decompressed);
serial_write(decompressed);
}
Expand All @@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) {
}
}

// Return the number of bytes needed to hold the decompressed form of
// `compressed`: the UTF-8 byte length stored at the front of the compressed
// stream, plus one for the trailing NUL.
uint16_t decompress_length(const compressed_string_t* compressed) {
    // The length field occupies the leading compress_max_length_bits bits of
    // the stream. When it is wider than 8 bits it spills from the first byte
    // into the first byte of the tail.
    if (compress_max_length_bits > 8) {
        uint16_t header = (uint16_t)((compressed->data << 8) | compressed->tail[0]);
        return 1 + (header >> (16 - compress_max_length_bits));
    }
    // Otherwise the whole field sits in the high bits of the first byte.
    return 1 + (compressed->data >> (8 - compress_max_length_bits));
}

char* decompress(const compressed_string_t* compressed, char* decompressed) {
uint8_t this_byte = 0;
uint8_t this_bit = 7;
uint8_t b = compressed->data[this_byte];
uint8_t this_byte = compress_max_length_bits / 8;
uint8_t this_bit = 7 - compress_max_length_bits % 8;
uint8_t b = (&compressed->data)[this_byte];
uint16_t length = decompress_length(compressed);

// Stop one early because the last byte is always NUL.
for (uint16_t i = 0; i < compressed->length - 1;) {
for (uint16_t i = 0; i < length - 1;) {
uint32_t bits = 0;
uint8_t bit_length = 0;
uint32_t max_code = lengths[0];
Expand All @@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
if (this_bit == 0) {
this_bit = 7;
this_byte += 1;
b = compressed->data[this_byte]; // This may read past the end but its never used.
b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
} else {
this_bit -= 1;
}
Expand All @@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
}

decompressed[compressed->length-1] = '\0';
decompressed[length-1] = '\0';
return decompressed;
}

inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
#ifndef NO_QSTR
#define QDEF(id, str)
#define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else
#define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
#include "genhdr/qstrdefs.generated.h"
#undef TRANSLATION
#undef QDEF
Expand Down
32 changes: 30 additions & 2 deletions supervisor/shared/translate.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,41 @@

#include <stdint.h>

// The format of the compressed data is:
// - the size of the uncompressed string in UTF-8 bytes, encoded as a
// (compress_max_length_bits)-bit number. compress_max_length_bits is
// computed during dictionary generation time, and happens to be 8
// for all current platforms. However, it'll probably end up being
// 9 in some translations sometime in the future. This length excludes
// the trailing NUL, though notably decompress_length includes it.
//
// - followed by the huffman encoding of the individual UTF-16 code
// points that make up the string. The trailing "\0" is not
// represented by a huffman code, but is implied by the length.
// (building the huffman encoding on UTF-16 code points gave better
// compression than building it on UTF-8 bytes)
//
// The "data" / "tail" construct is so that the struct's last member is a
// "flexible array". However, the _only_ member is not permitted to be
// a flexible member, so we have to declare the first byte as a separate
// member of the structure.
//
// For translations where length needs 8 bits, this saves about 1.5
// bytes per string on average compared to a structure of {uint16_t,
// flexible array}. It is also future-proofed against strings with a
// UTF-8 length of 256 or more (9 or more length bits), where the
// saving is still about 1.375 bytes per string.
typedef struct {
uint16_t length;
const uint8_t data[];
uint8_t data;
const uint8_t tail[];
} compressed_string_t;

// Return the compressed, translated version of a source string
// Usually, due to LTO, this is optimized into a load of a constant
// pointer.
const compressed_string_t* translate(const char* c);
void serial_write_compressed(const compressed_string_t* compressed);
char* decompress(const compressed_string_t* compressed, char* decompressed);
uint16_t decompress_length(const compressed_string_t* compressed);

#endif // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H