adafruit · dhalbert · May 29, 2020 · May 28, 2020 · May 28, 2020
diff --git a/main.c b/main.c
@@ -185,7 +185,7 @@ bool maybe_run_list(const char ** filenames, pyexec_result_t* exec_result) {
     }
     mp_hal_stdout_tx_str(filename);
     const compressed_string_t* compressed = translate(" output:\n");
-    char decompressed[compressed->length];
+    char decompressed[decompress_length(compressed)];
     decompress(compressed, decompressed);
     mp_hal_stdout_tx_str(decompressed);
     pyexec_file(filename, exec_result);

diff --git a/py/builtinhelp.c b/py/builtinhelp.c
@@ -135,7 +135,7 @@ STATIC void mp_help_print_modules(void) {
 
     // let the user know there may be other modules available from the filesystem
     const compressed_string_t* compressed = translate("Plus any modules on the filesystem\n");
-    char decompressed[compressed->length];
+    char decompressed[decompress_length(compressed)];
     decompress(compressed, decompressed);
     mp_print_str(MP_PYTHON_PRINTER, decompressed);
 }
@@ -181,7 +181,7 @@ STATIC mp_obj_t mp_builtin_help(size_t n_args, const mp_obj_t *args) {
         // print a general help message. Translate only works on single strings on one line.
         const compressed_string_t* compressed =
             translate("Welcome to Adafruit CircuitPython %s!\n\nPlease visit learn.adafruit.com/category/circuitpython for project guides.\n\nTo list built-in modules please do `help(\"modules\")`.\n");
-        char decompressed[compressed->length];
+        char decompressed[decompress_length(compressed)];
         decompress(compressed, decompressed);
         mp_printf(MP_PYTHON_PRINTER, decompressed, MICROPY_GIT_TAG);
     } else {

diff --git a/py/makeqstrdata.py b/py/makeqstrdata.py
@@ -1,7 +1,10 @@
 """
 Process raw qstr file and output qstr data with length, hash and data bytes.
 
-This script works with Python 2.6, 2.7, 3.3 and 3.4.
+This script works with Python 2.7, 3.3 and 3.4.
+
+For documentation about the format of compressed translated strings, see
+supervisor/shared/translate.h
 """
 
 from __future__ import print_function
@@ -132,19 +135,37 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
     print("// estimated total memory size", len(lengths) + 2*len(values) + sum(len(cb[u]) for u in all_strings_concat))
     print("//", values, lengths)
     values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
+    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
+        f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
     return values, lengths
 
-def decompress(encoding_table, length, encoded):
+def decompress(encoding_table, encoded, encoded_length_bits):
     values, lengths = encoding_table
-    #print(l, encoded)
     dec = []
     this_byte = 0
     this_bit = 7
     b = encoded[this_byte]
-    for i in range(length):
+    bits = 0
+    for i in range(encoded_length_bits):
+        bits <<= 1
+        if 0x80 & b:
+            bits |= 1
+
+        b <<= 1
+        if this_bit == 0:
+            this_bit = 7
+            this_byte += 1
+            if this_byte < len(encoded):
+                b = encoded[this_byte]
+        else:
+            this_bit -= 1
+    length = bits
+
+    i = 0
+    while i < length:
         bits = 0
         bit_length = 0
         max_code = lengths[0]
@@ -170,10 +191,11 @@ def decompress(encoding_table, length, encoded):
             searched_length += lengths[bit_length]
 
         v = values[searched_length + bits - max_code]
+        i += len(v.encode('utf-8'))
         dec.append(v)
     return ''.join(dec)
 
-def compress(encoding_table, decompressed):
+def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
     if not isinstance(decompressed, str):
         raise TypeError()
     values, lengths = encoding_table
@@ -182,6 +204,19 @@ def compress(encoding_table, decompressed):
     #print(lengths)
     current_bit = 7
     current_byte = 0
+
+    code = len_translation_encoded
+    bits = encoded_length_bits+1
+    for i in range(bits - 1, 0, -1):
+        if len_translation_encoded & (1 << (i - 1)):
+            enc[current_byte] |= 1 << current_bit
+        if current_bit == 0:
+            current_bit = 7
+            #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
+            current_byte += 1
+        else:
+            current_bit -= 1
+
     for c in decompressed:
         #print()
         #print("char", c, values.index(c))
@@ -342,14 +377,17 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
 
     total_text_size = 0
     total_text_compressed_size = 0
+    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original, translation in i18ns)
+    encoded_length_bits = max_translation_encoded_length.bit_length()
     for original, translation in i18ns:
         translation_encoded = translation.encode("utf-8")
-        compressed = compress(encoding_table, translation)
+        compressed = compress(encoding_table, translation, encoded_length_bits, len(translation_encoded))
         total_text_compressed_size += len(compressed)
-        decompressed = decompress(encoding_table, len(translation_encoded), compressed)
+        decompressed = decompress(encoding_table, compressed, encoded_length_bits)
+        assert decompressed == translation
         for c in C_ESCAPES:
             decompressed = decompressed.replace(c, C_ESCAPES[c])
-        print("TRANSLATION(\"{}\", {}, {{ {} }}) // {}".format(original, len(translation_encoded)+1, ", ".join(["0x{:02x}".format(x) for x in compressed]), decompressed))
+        print("TRANSLATION(\"{}\", {}) // {}".format(original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed))
         total_text_size += len(translation.encode("utf-8"))
 
     print()
@@ -385,6 +423,7 @@ def print_qstr_enums(qstrs):
 
     qcfgs, qstrs, i18ns = parse_input_headers(args.infiles)
     if args.translation:
+        i18ns = sorted(i18ns)
         translations = translate(args.translation, i18ns)
         encoding_table = compute_huffman_coding(translations, qstrs, args.compression_filename)
         print_qstr_data(encoding_table, qcfgs, qstrs, translations)

diff --git a/py/moduerrno.c b/py/moduerrno.c
@@ -158,7 +158,7 @@ const char *mp_common_errno_to_str(mp_obj_t errno_val, char *buf, size_t len) {
         case ENOSPC: desc = translate("No space left on device"); break;
         case EROFS:  desc = translate("Read-only filesystem"); break;
     }
-    if (desc != NULL && desc->length <= len) {
+    if (desc != NULL && decompress_length(desc) <= len) {
         decompress(desc, buf);
         return buf;
     }

diff --git a/py/obj.c b/py/obj.c
@@ -94,17 +94,17 @@ void mp_obj_print_exception(const mp_print_t *print, mp_obj_t exc) {
             assert(n % 3 == 0);
             // Decompress the format strings
             const compressed_string_t* traceback = translate("Traceback (most recent call last):\n");
-            char decompressed[traceback->length];
+            char decompressed[decompress_length(traceback)];
             decompress(traceback, decompressed);
 #if MICROPY_ENABLE_SOURCE_LINE
             const compressed_string_t* frame = translate("  File \"%q\", line %d");
 #else
             const compressed_string_t* frame = translate("  File \"%q\"");
 #endif
-            char decompressed_frame[frame->length];
+            char decompressed_frame[decompress_length(frame)];
             decompress(frame, decompressed_frame);
             const compressed_string_t* block_fmt = translate(", in %q\n");
-            char decompressed_block[block_fmt->length];
+            char decompressed_block[decompress_length(block_fmt)];
             decompress(block_fmt, decompressed_block);
 
             // Print the traceback

diff --git a/py/objexcept.c b/py/objexcept.c
@@ -400,7 +400,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
 
     // Try to allocate memory for the message
     mp_obj_str_t *o_str = m_new_obj_maybe(mp_obj_str_t);
-    size_t o_str_alloc = fmt->length + 1;
+    size_t o_str_alloc = decompress_length(fmt);
     byte *o_str_buf = m_new_maybe(byte, o_str_alloc);
 
     bool used_emg_buf = false;
@@ -433,7 +433,7 @@ mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, const com
         // We have some memory to format the string
         struct _exc_printer_t exc_pr = {!used_emg_buf, o_str_alloc, 0, o_str_buf};
         mp_print_t print = {&exc_pr, exc_add_strn};
-        char fmt_decompressed[fmt->length];
+        char fmt_decompressed[decompress_length(fmt)];
         decompress(fmt, fmt_decompressed);
         mp_vprintf(&print, fmt_decompressed, ap);
         exc_pr.buf[exc_pr.len] = '\0';

diff --git a/supervisor/shared/translate.c b/supervisor/shared/translate.c
@@ -37,7 +37,7 @@
 #include "supervisor/serial.h"
 
 void serial_write_compressed(const compressed_string_t* compressed) {
-    char decompressed[compressed->length];
+    char decompressed[decompress_length(compressed)];  // decompress_length() already counts the trailing NUL
     decompress(compressed, decompressed);
     serial_write(decompressed);
 }
@@ -58,12 +58,22 @@ STATIC int put_utf8(char *buf, int u) {
     }
 }
 
+uint16_t decompress_length(const compressed_string_t* compressed) {  // Buffer size needed by decompress(): stored length plus 1 for the NUL.
+    if (compress_max_length_bits <= 8) {  // the whole length prefix sits in the top bits of the first byte
+        return 1 + (compressed->data >> (8 - compress_max_length_bits));  // drop the huffman bits that share this byte
+    } else {  // prefix continues into tail[0]; NOTE(review): assumes compress_max_length_bits <= 16 - confirm
+        return 1 + ((compressed->data * 256 + compressed->tail[0]) >> (16 - compress_max_length_bits));  // first two bytes read big-endian
+    }
+}
+
 char* decompress(const compressed_string_t* compressed, char* decompressed) {
-    uint8_t this_byte = 0;
-    uint8_t this_bit = 7;
-    uint8_t b = compressed->data[this_byte];
+    uint8_t this_byte = compress_max_length_bits / 8;
+    uint8_t this_bit = 7 - compress_max_length_bits % 8;
+    uint8_t b = (&compressed->data)[this_byte];
+    uint16_t length = decompress_length(compressed);
+
     // Stop one early because the last byte is always NULL.
-    for (uint16_t i = 0; i < compressed->length - 1;) {
+    for (uint16_t i = 0; i < length - 1;) {
         uint32_t bits = 0;
         uint8_t bit_length = 0;
         uint32_t max_code = lengths[0];
@@ -78,7 +88,7 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
             if (this_bit == 0) {
                 this_bit = 7;
                 this_byte += 1;
-                b = compressed->data[this_byte]; // This may read past the end but its never used.
+                b = (&compressed->data)[this_byte]; // This may read past the end but its never used.
             } else {
                 this_bit -= 1;
             }
@@ -91,14 +101,14 @@ char* decompress(const compressed_string_t* compressed, char* decompressed) {
         i += put_utf8(decompressed + i, values[searched_length + bits - max_code]);
     }
 
-    decompressed[compressed->length-1] = '\0';
+    decompressed[length-1] = '\0';
     return decompressed;
 }
 
 inline __attribute__((always_inline)) const compressed_string_t* translate(const char* original) {
     #ifndef NO_QSTR
     #define QDEF(id, str)
-    #define TRANSLATION(id, len, compressed...) if (strcmp(original, id) == 0) { static const compressed_string_t v = {.length = len, .data = compressed}; return &v; } else
+    #define TRANSLATION(id, firstbyte, ...) if (strcmp(original, id) == 0) { static const compressed_string_t v = { .data = firstbyte, .tail = { __VA_ARGS__ } }; return &v; } else
     #include "genhdr/qstrdefs.generated.h"
     #undef TRANSLATION
     #undef QDEF

diff --git a/supervisor/shared/translate.h b/supervisor/shared/translate.h
@@ -29,13 +29,41 @@
 
 #include <stdint.h>
 
+// The format of the compressed data is:
+// - the size of the uncompressed string in UTF-8 bytes, encoded as a
+//   (compress_max_length_bits)-bit number.  compress_max_length_bits is
+//   computed during dictionary generation time, and happens to be 8
+//   for all current platforms.  However, it'll probably end up being
+//   9 in some translations sometime in the future.  This length excludes
+//   the trailing NUL, though notably decompress_length includes it.
+//
+// - followed by the huffman encoding of the individual UTF-16 code
+//   points that make up the string.  The trailing "\0" is not
+//   represented by a huffman code, but is implied by the length.
+//   (building the huffman encoding on UTF-16 code points gave better
+//   compression than building it on UTF-8 bytes)
+//
+// The "data" / "tail" construct is so that the struct's last member is a
+// "flexible array".  However, the _only_ member is not permitted to be
+// a flexible member, so we have to declare the first byte as a separate
+// member of the structure.
+//
+// For translations where length needs 8 bits, this saves about 1.5
+// bytes per string on average compared to a structure of {uint16_t,
+// flexible array}, but is also future-proofed against strings with
+// UTF-8 length above 256, with a savings of about 1.375 bytes per
+// string.
 typedef struct {
-    uint16_t length;
-    const uint8_t data[];
+    uint8_t data;  // first byte of the compressed stream (the length prefix starts here)
+    const uint8_t tail[];  // all remaining bytes of the compressed stream
 } compressed_string_t;
 
+// Return the compressed, translated version of a source string
+// Usually, due to LTO, this is optimized into a load of a constant
+// pointer.
 const compressed_string_t* translate(const char* c);
 void serial_write_compressed(const compressed_string_t* compressed);
 char* decompress(const compressed_string_t* compressed, char* decompressed);
+uint16_t decompress_length(const compressed_string_t* compressed);  // bytes needed for the decompressed string, including the trailing NUL
 
 #endif  // MICROPY_INCLUDED_SUPERVISOR_TRANSLATE_H