diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index e8167acfc0742b..e2a230fd30e9d3 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -33,65 +33,77 @@ msgid "" " multiline!\n" msgstr "" -#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94 -#: messages.py:99 messages.py:100 messages.py:101 +#: messages.py:32 +msgid "" +"this is a very very very very very very very very very very very very very " +"long string!" +msgstr "" + +#: messages.py:33 +msgid "" +"this is a very very very very very very very very very " +"very very very very long string with weird spaces!" +msgstr "" + +#: messages.py:50 messages.py:93 messages.py:94 messages.py:97 messages.py:98 +#: messages.py:103 messages.py:104 messages.py:105 msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:47 +#: messages.py:51 msgid "something" msgstr "" -#: messages.py:50 +#: messages.py:54 msgid "Hello, {}!" 
msgstr "" -#: messages.py:54 +#: messages.py:58 msgid "1" msgstr "" -#: messages.py:54 +#: messages.py:58 msgid "2" msgstr "" -#: messages.py:55 messages.py:56 +#: messages.py:59 messages.py:60 msgid "A" msgstr "" -#: messages.py:55 messages.py:56 +#: messages.py:59 messages.py:60 msgid "B" msgstr "" -#: messages.py:57 +#: messages.py:61 msgid "set" msgstr "" -#: messages.py:62 messages.py:63 +#: messages.py:66 messages.py:67 msgid "nested string" msgstr "" -#: messages.py:68 +#: messages.py:72 msgid "baz" msgstr "" -#: messages.py:71 messages.py:75 +#: messages.py:75 messages.py:79 msgid "default value" msgstr "" -#: messages.py:91 messages.py:92 messages.py:95 messages.py:96 +#: messages.py:95 messages.py:96 messages.py:99 messages.py:100 msgctxt "context" msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:102 +#: messages.py:106 msgid "domain foo" msgstr "" -#: messages.py:118 messages.py:119 +#: messages.py:122 messages.py:123 msgid "world" msgid_plural "worlds" msgstr[0] "" diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index 9457bcb8611020..e4a1c5e60f1e5a 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -28,6 +28,10 @@ multiline! 
""") +# very long strings that should be wrapped by normalize +_("this is a very very very very very very very very very very very very very long string!") +_("this is a very very very very very very very very very very very very very long string with weird spaces!") + # Invalid arguments _() _(None) diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index d73fcff4c9cb11..c9002c124fa433 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -5,6 +5,7 @@ import sys import unittest from textwrap import dedent +from types import SimpleNamespace from pathlib import Path from test.support.script_helper import assert_python_ok @@ -18,7 +19,7 @@ with imports_under_tool("i18n"): - from pygettext import parse_spec + from pygettext import parse_spec, make_escapes, normalize def normalize_POT_file(pot): @@ -516,6 +517,51 @@ def test_parse_keyword_spec(self): parse_spec(spec) self.assertEqual(str(cm.exception), message) + def setUp(self): + # required to set up normalize + make_escapes(True) + + def test_normalize_multiline(self): + s = 'multi-line\n translation' + s_expected = '""\n"multi-line\\n"\n" translation"' + + data = normalize(s, 'UTF-8', 'msgid', 78) + self.assertEqual(s_expected, data) + + def test_normalize_wrap(self): + cases = ( + ('multi-line\n translation', '""\n"multi-line\\n"\n" translation"'), + ('fee fi fo fum fee fi ', '"fee fi fo fum fee fi "'), # len = 29 + ('fee fi fo fum fee fi f', '"fee fi fo fum fee fi f"'), # len = 30 + ('fee fi fo fum fee fi fo', '""\n"fee fi fo fum fee fi fo"' ),# len = 31 + ) + for raw, expected in cases: + with self.subTest(raw): + data = normalize(raw, 'UTF-8', 'msgid', 30) + self.assertEqual(expected, data) + + def test_normalize_empty_str(self): + data = normalize('', 'UTF-8', 'msgid', 30) + self.assertEqual('""', data) + + def test_normalize_single_word(self): + for s in ("fee", "fi", "fo", "fums"): + data = normalize(s, 'UTF-8', 'msgid', 8) + 
# One chunk is either a run of whitespace or a word together with the
# whitespace that follows it; wrapped lines are only broken between chunks.
_space_splitter = re.compile(r'\s+|\S+\s*')


def normalize(s, encoding, prefix, width):
    """Return *s* as a quoted .po string, wrapped to *width* columns.

    s         -- the raw message text.
    encoding  -- forwarded unchanged to escape().
    prefix    -- the keyword the caller prints in front of the result
                 ('msgctxt', 'msgid' or 'msgid_plural'); only its length
                 is used, to decide whether the text fits beside it.
    width     -- maximum column count of an output line.

    The text is first cut after every '\\n'.  A piece that does not fit
    next to the keyword is further broken between whitespace-delimited
    chunks; a chunk that is too long on its own is kept intact.

    The result is a single '"..."' literal when one line suffices, or an
    empty '""' followed by one quoted literal per line otherwise.
    """
    # Break only after '\n'.  str.splitlines() would also break at '\r',
    # '\v', '\f', '\x1c'-'\x1e', '\x85', '\u2028' and '\u2029', turning
    # short strings that merely contain one of them into multi-line entries.
    *head, tail = s.split('\n')
    raw_lines = [part + '\n' for part in head]
    if tail:
        raw_lines.append(tail)

    # Columns taken on the keyword line besides the text itself:
    # the keyword, one space and the two quotes.
    overhead = len(prefix) + 3

    lines = []
    # True once an over-long line offered at least one break point.  This is
    # decided on the raw chunks, i.e. with the same tokenisation that drives
    # the wrapping, so whitespace that escape() rewrites (tab, carriage
    # return) is treated like a plain space.
    breakable = False
    for line in raw_lines:
        escaped_line = escape(line, encoding)
        if len(escaped_line) + overhead <= width:
            lines.append(escaped_line)
            continue

        chunks = _space_splitter.findall(line)
        if len(chunks) > 1:
            breakable = True
        buf = []
        size = 0
        for chunk in chunks:
            escaped_chunk = escape(chunk, encoding)
            # Wrapped lines carry only the two quotes.  Never flush an
            # empty buffer: an over-long chunk still gets its own line.
            if buf and size + len(escaped_chunk) + 2 > width:
                lines.append(''.join(buf))
                buf = []
                size = 0
            buf.append(escaped_chunk)
            size += len(escaped_chunk)
        lines.append(''.join(buf))

    # Keep the text beside the keyword when there is at most one line and
    # it either fits or could not have been broken anyway.
    if len(lines) <= 1 and not breakable:
        return f'"{escape(s, encoding)}"'
    return '""\n' + '\n'.join(f'"{line}"' for line in lines)