From 899413076184fc7f44e41671ff247861285fcadd Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Wed, 13 Nov 2024 22:09:03 +0100 Subject: [PATCH 1/8] Do not extract messages from function definitions. Fixes a bug where pygettext would attempt to extract a message from a code like this: def _(x): pass This is because pygettext only looks at one token at a time and '_(x)' looks like a function call. However, since 'x' is not a string literal, it would erroneously issue a warning. This commit fixes that by keeping track of the previous token and checking if it's 'def' or 'class'. --- Lib/test/test_tools/test_i18n.py | 36 +++++++++++++++++++++++++++----- Tools/i18n/pygettext.py | 20 +++++++++++------- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 21dead8f943bb7..eceda523a41ba4 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -87,17 +87,26 @@ def assert_POT_equal(self, expected, actual): self.maxDiff = None self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual)) - def extract_docstrings_from_str(self, module_content): - """ utility: return all msgids extracted from module_content """ - filename = 'test_docstrings.py' - with temp_cwd(None) as cwd: + def extract_from_str(self, module_content, *, args=(), strict=True): + filename = 'test.py' + with temp_cwd(None): with open(filename, 'w', encoding='utf-8') as fp: fp.write(module_content) - assert_python_ok('-Xutf8', self.script, '-D', filename) + res = assert_python_ok('-Xutf8', self.script, *args, filename) + if strict: + self.assertEqual(res.err, b'') with open('messages.pot', encoding='utf-8') as fp: data = fp.read() return self.get_msgids(data) + def extract_docstrings_from_str(self, module_content): + """Return all docstrings extracted from module_content.""" + return self.extract_from_str(module_content, args=('--docstrings',), strict=False) + + def extract_messages_from_str(self, module_content): + """Return all msgids extracted from module_content.""" + return self.extract_from_str(module_content) + def test_header(self): """Make sure the required fields are in the header, according to: http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry @@ -344,6 +353,23 @@ def test_calls_in_fstring_with_partially_wrong_expression(self): self.assertNotIn('foo', msgids) self.assertIn('bar', msgids) + def test_function_and_class_names(self): + """Test that function and class names are not mistakenly extracted.""" + msgids = self.extract_messages_from_str(dedent('''\ + def _(x): + pass + + def _(x="foo"): + pass + + async def _(x): + pass + + class _(object): + pass + ''')) + self.assertEqual(msgids, ['']) + def test_pygettext_output(self): """Test that the pygettext output exactly matches snapshots.""" for input_file in DATA_DIR.glob('*.py'): diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 3a0b27ba420e7a..a2a02142edc936 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -5,7 +5,7 @@ # Minimally patched to make it even more xgettext compatible # by Peter Funk # -# 2002-11-22 J�rgen Hermann +# 2002-11-22 Jürgen Hermann # Added checks that _() only contains string literals, and # command line args are resolved to module lists, i.e. you # can now pass a filename, a module or package name, or a @@ -207,7 +207,7 @@ def make_escapes(pass_nonascii): global escapes, escape if pass_nonascii: # Allow non-ascii characters to pass through so that e.g. 'msgid - # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we + # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we # escape any character outside the 32..126 range. mod = 128 escape = escape_ascii @@ -306,6 +306,11 @@ def getFilesForName(name): return [] +def _is_def_or_class_keyword(token): + ttype, tstring, *_ = token + return ttype == tokenize.NAME and tstring in ('def', 'class') + + class TokenEater: def __init__(self, options): self.__options = options @@ -316,13 +321,11 @@ def __init__(self, options): self.__freshmodule = 1 self.__curfile = None self.__enclosurecount = 0 + self.__prev_token = None def __call__(self, ttype, tstring, stup, etup, line): - # dispatch -## import token -## print('ttype:', token.tok_name[ttype], 'tstring:', tstring, -## file=sys.stderr) self.__state(ttype, tstring, stup[0]) + self.__prev_token = (ttype, tstring, stup, etup, line) def __waiting(self, ttype, tstring, lineno): opts = self.__options @@ -341,7 +344,10 @@ def __waiting(self, ttype, tstring, lineno): if ttype == tokenize.NAME and tstring in ('class', 'def'): self.__state = self.__suiteseen return - if ttype == tokenize.NAME and tstring in opts.keywords: + if ( + ttype == tokenize.NAME and tstring in opts.keywords + and (not self.__prev_token or not _is_def_or_class_keyword(self.__prev_token)) + ): self.__state = self.__keywordseen return if ttype == tokenize.STRING: From c8d153810c623170917a5ff7021cabc3d71dde44 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Wed, 13 Nov 2024 22:27:16 +0100 Subject: [PATCH 2/8] Add news entry --- .../Tools-Demos/2024-11-13-22-23-36.gh-issue-126807.vpaWuN.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Tools-Demos/2024-11-13-22-23-36.gh-issue-126807.vpaWuN.rst diff --git a/Misc/NEWS.d/next/Tools-Demos/2024-11-13-22-23-36.gh-issue-126807.vpaWuN.rst b/Misc/NEWS.d/next/Tools-Demos/2024-11-13-22-23-36.gh-issue-126807.vpaWuN.rst new file mode 100644 index 00000000000000..310286ce8319ea --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2024-11-13-22-23-36.gh-issue-126807.vpaWuN.rst @@ -0,0 +1,2 @@ +Fix extraction warnings in :program:`pygettext.py` caused by mistaking +function definitions for function calls. From 6cc0833d23982f0b87c4fc70ef59ab7f196a3e9d Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Wed, 13 Nov 2024 22:31:45 +0100 Subject: [PATCH 3/8] Remove 'coding:' directive --- Tools/i18n/pygettext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index a2a02142edc936..a23ced10ee1d0f 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# -*- coding: iso-8859-1 -*- # Originally written by Barry Warsaw # # Minimally patched to make it even more xgettext compatible From 6e0cd5029d90e2b6e055a0dac37d719c9473684b Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 14 Nov 2024 21:54:23 +0100 Subject: [PATCH 4/8] Simplify test --- Lib/test/test_tools/test_i18n.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index eceda523a41ba4..6f71f0976819f1 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -88,6 +88,7 @@ def assert_POT_equal(self, expected, actual): self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual)) def extract_from_str(self, module_content, *, args=(), strict=True): + """Return all msgids extracted from module_content.""" filename = 'test.py' with temp_cwd(None): with open(filename, 'w', encoding='utf-8') as fp: @@ -103,10 +104,6 @@ def extract_docstrings_from_str(self, module_content): """Return all docstrings extracted from module_content.""" return self.extract_from_str(module_content, args=('--docstrings',), strict=False) - def extract_messages_from_str(self, module_content): - """Return all msgids extracted from module_content.""" - return self.extract_from_str(module_content) - def test_header(self): """Make sure the required fields are in the header, according to: http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry @@ -355,7 +352,7 @@ def test_calls_in_fstring_with_partially_wrong_expression(self): def test_function_and_class_names(self): """Test that function and class names are not mistakenly extracted.""" - msgids = self.extract_messages_from_str(dedent('''\ + msgids = self.extract_from_str(dedent('''\ def _(x): pass From 249db28533316f5cfee88aaeaafcf3dca9e68091 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 14 Nov 2024 22:22:02 +0100 Subject: [PATCH 5/8] Revert unrelated changes --- Tools/i18n/pygettext.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index a23ced10ee1d0f..1002c2edd1fc42 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -1,10 +1,11 @@ #! /usr/bin/env python3 +# -*- coding: iso-8859-1 -*- # Originally written by Barry Warsaw # # Minimally patched to make it even more xgettext compatible # by Peter Funk # -# 2002-11-22 Jürgen Hermann +# 2002-11-22 J�rgen Hermann # Added checks that _() only contains string literals, and # command line args are resolved to module lists, i.e. you # can now pass a filename, a module or package name, or a @@ -206,7 +207,7 @@ def make_escapes(pass_nonascii): global escapes, escape if pass_nonascii: # Allow non-ascii characters to pass through so that e.g. 'msgid - # "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we + # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we # escape any character outside the 32..126 range. mod = 128 escape = escape_ascii @@ -323,6 +324,10 @@ def __init__(self, options): self.__prev_token = None def __call__(self, ttype, tstring, stup, etup, line): + # dispatch +## import token +## print('ttype:', token.tok_name[ttype], 'tstring:', tstring, +## file=sys.stderr) self.__state(ttype, tstring, stup[0]) self.__prev_token = (ttype, tstring, stup, etup, line) From e56333181fe002306d6fcda43ce39ff2aa7a19be Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 14 Nov 2024 22:25:25 +0100 Subject: [PATCH 6/8] Use an extra state instead of prev_token --- Tools/i18n/pygettext.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 1002c2edd1fc42..cbd1137572ff6e 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -321,7 +321,6 @@ def __init__(self, options): self.__freshmodule = 1 self.__curfile = None self.__enclosurecount = 0 - self.__prev_token = None def __call__(self, ttype, tstring, stup, etup, line): # dispatch @@ -329,7 +328,6 @@ def __call__(self, ttype, tstring, stup, etup, line): ## print('ttype:', token.tok_name[ttype], 'tstring:', tstring, ## file=sys.stderr) self.__state(ttype, tstring, stup[0]) - self.__prev_token = (ttype, tstring, stup, etup, line) def __waiting(self, ttype, tstring, lineno): opts = self.__options @@ -348,10 +346,10 @@ def __waiting(self, ttype, tstring, lineno): if ttype == tokenize.NAME and tstring in ('class', 'def'): self.__state = self.__suiteseen return - if ( - ttype == tokenize.NAME and tstring in opts.keywords - and (not self.__prev_token or not _is_def_or_class_keyword(self.__prev_token)) - ): + if ttype == tokenize.NAME and tstring in ('class', 'def'): + self.__state = self.__ignorenext + return + if ttype == tokenize.NAME and tstring in opts.keywords: self.__state = self.__keywordseen return if ttype == tokenize.STRING: @@ -458,6 +456,9 @@ def __openseen(self, ttype, tstring, lineno): }, file=sys.stderr) self.__state = self.__waiting + def __ignorenext(self, ttype, tstring, lineno): + self.__state = self.__waiting + def __addentry(self, msg, lineno=None, isdocstring=0): if lineno is None: lineno = self.__lineno From 7b37aa2c01971764f3745d2d29bb2265ad070e83 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 14 Nov 2024 22:26:34 +0100 Subject: [PATCH 7/8] Remove unused function --- Tools/i18n/pygettext.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index cbd1137572ff6e..45c46d6cbfdd94 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -306,11 +306,6 @@ def getFilesForName(name): return [] -def _is_def_or_class_keyword(token): - ttype, tstring, *_ = token - return ttype == tokenize.NAME and tstring in ('def', 'class') - - class TokenEater: def __init__(self, options): self.__options = options From fa0772e0917e4dfd7a7d1e65926f26dc1a337c6b Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 14 Nov 2024 22:45:54 +0100 Subject: [PATCH 8/8] Fix character encoding --- Tools/i18n/pygettext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 45c46d6cbfdd94..0d16e8f7da0071 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -5,7 +5,7 @@ # Minimally patched to make it even more xgettext compatible # by Peter Funk # -# 2002-11-22 J�rgen Hermann +# 2002-11-22 J�rgen Hermann # Added checks that _() only contains string literals, and # command line args are resolved to module lists, i.e. you # can now pass a filename, a module or package name, or a @@ -207,7 +207,7 @@ def make_escapes(pass_nonascii): global escapes, escape if pass_nonascii: # Allow non-ascii characters to pass through so that e.g. 'msgid - # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we + # "H�he"' would result not result in 'msgid "H\366he"'. Otherwise we # escape any character outside the 32..126 range. mod = 128 escape = escape_ascii