diff --git a/Lib/test/test_tools/i18n_data/exclude_file.pot b/Lib/test/test_tools/i18n_data/exclude_file.pot new file mode 100644 index 00000000000000..4c94fdde65df51 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/exclude_file.pot @@ -0,0 +1,38 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2000-01-01 00:00+0000\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + + +msgid "excluded" +msgstr "" + +msgid "multiline\nexcluded" +msgstr "" + +msgid "singular excluded" +msgid_plural "plural excluded" +msgstr[0] "" +msgstr[1] "" + +msgctxt "context" +msgid "context excluded" +msgstr "" + +msgctxt "context" +msgid "context singular excluded" +msgid_plural "context plural excluded" +msgstr[0] "" +msgstr[1] "" + diff --git a/Lib/test/test_tools/i18n_data/excluded.pot b/Lib/test/test_tools/i18n_data/excluded.pot new file mode 100644 index 00000000000000..c3037b24e5cb87 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/excluded.pot @@ -0,0 +1,21 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2000-01-01 00:00+0000\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + + +#: excluded.py:4 +msgid "foo" +msgstr "" + diff --git a/Lib/test/test_tools/i18n_data/excluded.py b/Lib/test/test_tools/i18n_data/excluded.py new file mode 100644 index 00000000000000..a9249c6fa540c2 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/excluded.py @@ -0,0 +1,14 @@ +from gettext import gettext as _, ngettext, npgettext, pgettext + + +_('foo') + +_('excluded') + +_('multiline\nexcluded') + +ngettext('singular excluded', 'plural excluded', 2) + +pgettext('context', 'context excluded') + +npgettext('context', 'context singular excluded', 'context plural excluded', 2) diff --git a/Lib/test/test_tools/i18n_data/general.json b/Lib/test/test_tools/i18n_data/general.json new file mode 100644 index 00000000000000..42f8470784ede3 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/general.json @@ -0,0 +1,103 @@ +[ + { + "msgctxt": null, + "msgid": "", + "msgid_plural": null, + "msgstr": "Project-Id-Version: PACKAGE VERSION\nPOT-Creation-Date: 2024-10-26 18:06+0200\nPO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\nLast-Translator: FULL NAME \nLanguage-Team: LANGUAGE \nMIME-Version: 1.0\nContent-Type: text/plain; charset=UTF-8\nContent-Transfer-Encoding: 8bit\n" + }, + { + "msgctxt": null, + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": null, + "msgid": "Escape sequences \" \n \t", + "msgid_plural": null, + "msgstr": "" + }, + { + "msgctxt": null, + "msgid": "Python", + "msgid_plural": null, + "msgstr": "Python" + }, + { + "msgctxt": null, + "msgid": "Python (2)", + "msgid_plural": null, + "msgstr": "Python (2)" + }, + { + "msgctxt": null, + "msgid": "αβ", + "msgid_plural": null, + "msgstr": "αβ" + }, + { + "msgctxt": null, + "msgid": "barbaz", + "msgid_plural": null, + "msgstr": "quxxyz" + }, + { + "msgctxt": null, + "msgid": "xyz", + "msgid_plural": null, + "msgstr": "" + }, + { + "msgctxt": "context", + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": "xyz", + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": null, + "msgid": "One email sent.", + "msgid_plural": "%d emails sent.", + "msgstr": [ + "One email sent.", + "%d emails sent." + ] + }, + { + "msgctxt": null, + "msgid": "One message sent.", + "msgid_plural": "%d messages sent.", + "msgstr": [ + "%d message sent." + ] + }, + { + "msgctxt": "abc", + "msgid": "One email sent.", + "msgid_plural": "%d emails sent.", + "msgstr": [ + "One email sent.", + "%d emails sent." + ] + }, + { + "msgctxt": null, + "msgid": "qux", + "msgid_plural": "quxs", + "msgstr": [ + "abc", + "xyz" + ] + }, + { + "msgctxt": null, + "msgid": "baz", + "msgid_plural": null, + "msgstr": "" + } +] \ No newline at end of file diff --git a/Lib/test/test_tools/i18n_data/general.po b/Lib/test/test_tools/i18n_data/general.po new file mode 100644 index 00000000000000..e86725aa93acbd --- /dev/null +++ b/Lib/test/test_tools/i18n_data/general.po @@ -0,0 +1,85 @@ +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2024-10-26 18:06+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "foo" +msgstr "bar" + +# Selected escape sequences are allowed +msgid "Escape sequences \" \n \t" +msgstr "" + +# Octal escape sequences are allowed +msgid "\120\171\164\150\157\156" +msgstr "Python" + +# Hex escape sequences are allowed +msgid "\x50\x79\x74\x68\x6f\x6e (2)" +msgstr "Python (2)" + +# non-ascii +msgid "αβ" +msgstr "αβ" + +# Empty lines are ignored +msgid "" +"bar" + +"baz" +msgstr "" +"qux" + +"xyz" + +# Keyword line does not need to contain a string +msgid +"xyz" +msgstr "" + +# comment +# comment + +msgctxt "context" +msgid "foo" +msgstr "bar" + +msgctxt "xyz" +msgid "foo" +msgstr "bar" + +msgid "One email sent." +msgid_plural "%d emails sent." +msgstr[0] "One email sent." +msgstr[1] "%d emails sent." + +# Each message can have a different number of msgstr[N] +msgid "One message sent." +msgid_plural "%d messages sent." +msgstr[0] "%d message sent." + +msgctxt "abc" +msgid "One email sent." +msgid_plural "%d emails sent." +msgstr[0] "One email sent." +msgstr[1] "%d emails sent." + +msgid "qux" +msgid_plural "quxs" +msgstr[0] "a" +"b" +"c" +msgstr[1] "x" +"y" +"z" + +msgid "baz" +msgstr "" + +# trailing comments are allowed diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 66c33077423229..650af9f733e973 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -1,5 +1,7 @@ """Tests to cover the Tools/i18n package""" +import codecs +import json import os import re import sys @@ -7,7 +9,7 @@ from textwrap import dedent from pathlib import Path -from test.support.script_helper import assert_python_ok +from test.support.script_helper import assert_python_failure, assert_python_ok from test.test_tools import imports_under_tool, skip_if_missing, toolsdir from test.support.os_helper import temp_cwd, temp_dir @@ -18,7 +20,7 @@ with imports_under_tool("i18n"): - from pygettext import parse_spec + from pygettext import parse_po, parse_quoted_strings, parse_spec def normalize_POT_file(pot): @@ -516,8 +518,256 @@ def test_parse_keyword_spec(self): parse_spec(spec) self.assertEqual(str(cm.exception), message) + def test_missing_exclude_file(self): + """ + Test that an error is raised if the exclude file (passed via + --exclude-file) does not exist. + """ + _, _, stderr = assert_python_failure(self.script, + '--exclude-file=foo.po') + self.assertIn("Can't read --exclude-file: foo.po", + stderr.decode('utf-8')) + + def test_invalid_exclude_file(self): + """ + Test that an error is raised if the exclude file (passed via + --exclude-file) is not a valid PO file. + """ + with temp_cwd(None): + # Create an invalid PO file + Path('invalid.po').write_text('Invalid PO file', encoding='utf-8') + + _, _, stderr = assert_python_failure(self.script, + '--exclude-file=invalid.po') + self.assertIn("Invalid exclude file (invalid.po):", + stderr.decode('utf-8')) + + +class TestPOParser(unittest.TestCase): + def test_parse_quoted_strings(self): + class DummyState: + filename = 'foo.po' + lineno = 1 + + valid_strings = ( + # no strings + ('', ''), + (' ', ''), + ('\t', ''), + # empty strings + ('""', ''), + ('"" "" ""', ''), + # allowed escape sequences + (r'"\\"', '\\'), + (r'"\""', '"'), + (r'"\t"', '\t'), + (r'"\n"', '\n'), + (r'"\r"', '\r'), + (r'"\f"', '\f'), + (r'"\a"', '\a'), + (r'"\b"', '\b'), + (r'"\v"', '\v'), + # non-empty strings + ('"foo"', 'foo'), + ('"foo" "bar"', 'foobar'), + ('"foo""bar"', 'foobar'), + ('"" "foo" ""', 'foo'), + # newlines and tabs + (r'"foo\nbar"', 'foo\nbar'), + (r'"foo\n" "bar"', 'foo\nbar'), + (r'"foo\tbar"', 'foo\tbar'), + (r'"foo\t" "bar"', 'foo\tbar'), + # escaped quotes + (r'"foo\"bar"', 'foo"bar'), + (r'"foo\"" "bar"', 'foo"bar'), + (r'"foo\\" "bar"', 'foo\\bar'), + # octal escapes + (r'"\120\171\164\150\157\156"', 'Python'), + (r'"\120\171\164" "\150\157\156"', 'Python'), + (r'"\"\120\171\164" "\150\157\156\""', '"Python"'), + # hex escapes + (r'"\x50\x79\x74\x68\x6f\x6e"', 'Python'), + (r'"\x50\x79\x74" "\x68\x6f\x6e"', 'Python'), + (r'"\"\x50\x79\x74" "\x68\x6f\x6e\""', '"Python"'), + ) + for string, expected in valid_strings: + with self.subTest(string=string): + parsed = parse_quoted_strings(DummyState(), string) + self.assertEqual(parsed, expected) + + invalid_strings = ( + "''", + '"', + '"""', + '"" "', + 'foo', + '"" "foo', + '"foo" foo', + '42', + '"" 42 ""', + # disallowed escape sequences + r"\'", + r'"\e"', + r'"\8"', + r'"\9"', + r'"\x"', + r'\u1234', + r'"\N{ROMAN NUMERAL NINE}"' + ) + for string in invalid_strings: + with self.subTest(string=string): + with self.assertRaises(ValueError): + parse_quoted_strings(DummyState(), string) + + def test_semantic_errors(self): + pos = ( + # parse_po + ('msgctxt "foo"', 'Missing msgid after msgctxt'), + ('msgid "foo"', 'Missing msgstr after msgid'), + # parse_comment + ('msgctxt "foo"\n# comment', + 'Comment line not allowed after msgctxt'), + ('msgid "foo"\n# comment', + 'Comment line not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\n# comment', + 'Comment line not allowed after msgid_plural'), + # parse_msgctxt + ('msgctxt "foo"\nmsgctxt "bar"', + 'msgctxt not allowed after msgctxt'), + ('msgid "foo"\nmsgctxt "bar"', 'msgctxt not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgctxt "bar"', + 'msgctxt not allowed after msgid_plural'), + # parse_msgid + ('msgid "foo"\nmsgid "bar"', 'msgid not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgid "bar"', + 'msgid not allowed after msgid_plural'), + # parse_msgid_plural + ('msgid_plural "foos"', 'msgid_plural must be preceded by msgid'), + ('# comment\nmsgid_plural "foos"', + 'msgid_plural not allowed after comment'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgid_plural "bars"', + 'msgid_plural not allowed after msgid_plural'), + ('msgctxt "foo"\nmsgid_plural "foos"', + 'msgid_plural not allowed after msgctxt'), + ('msgid "foo"\nmsgstr "bar"\nmsgid_plural "foos"', + 'msgid_plural not allowed after msgstr'), + # parse_msgstr + ('msgstr "foo"', 'msgstr must be preceded by msgid'), + ('# comment\nmsgstr "foo"', 'msgstr not allowed after comment'), + ('msgctxt "foo"\nmsgstr "bar"', + 'msgstr not allowed after msgctxt'), + ('msgid "foo"\nmsgstr "bar"\nmsgstr "baz"', + 'msgstr not allowed after msgstr'), + # parse_line + ('"foo"', 'Syntax error before:'), + ('# comment\n"foo"', 'Syntax error before:'), + ) + for po, message in pos: + with self.subTest(po=po): + with self.assertRaises(ValueError) as cm: + parse_po(po.encode('utf-8'), 'foo.po') + self.assertIn(message, str(cm.exception)) + + def test_msgstr_invalid_indices(self): + pos = ( + (''' +msgid "foo" +msgstr[0] "bar" +''', 'Missing msgid_plural section'), + (''' +msgid "foo" +msgid_plural "foos" +msgstr[0] "bar" +msgstr[42] "bars" +''', "Plural form has incorrect index, found '42' but should be '1'"), + (''' +msgid "foo" +msgid_plural "foos" +msgstr "bar" +''', "Indexed msgstr required after msgid_plural"), + ) + for po, message in pos: + with self.subTest(po=po): + with self.assertRaises(ValueError) as cm: + parse_po(po.encode('utf-8'), 'foo.po') + self.assertIn(message, str(cm.exception)) + + def test_duplicate_entries(self): + po = b''' +msgid "foo" +msgstr "bar" + +msgid "foo" +msgstr "baz" +''' + with self.assertRaisesRegex(ValueError, "Duplicate entry: 'foo'"): + parse_po(po, 'foo.po') + + po = b''' +msgctxt "context" +msgid "foo" +msgstr "bar" + +msgctxt "context" +msgid "foo" +msgstr "baz" +''' + with self.assertRaises(ValueError) as cm: + parse_po(po, 'foo.po') + self.assertIn("Duplicate entry: ('context', 'foo')", str(cm.exception)) + + def test_encoding(self): + po = r''' +msgid "" +msgstr "" +"Content-Type: text/plain; charset=UTF-8\n" + +msgid "αβ" +msgstr "αβ" +''' + expected = [{ + 'msgctxt': None, + 'msgid': '', + 'msgid_plural': None, + 'msgstr': 'Content-Type: text/plain; charset=UTF-8\n', + }, { + 'msgctxt': None, + 'msgid': 'αβ', + 'msgid_plural': None, + 'msgstr': 'αβ', + }] + self.assertEqual(parse_po(po.encode('utf-8'), 'foo.po'), expected) + + def test_missing_encoding(self): + po = ''' +msgid "αβ" +msgstr "αβ" +''' + ab = "αβ".encode('utf-8').decode('latin-1') + expected = [{ + 'msgctxt': None, + 'msgid': ab, + 'msgid_plural': None, + 'msgstr': ab, + }] + self.assertEqual(parse_po(po.encode('utf-8'), 'foo.po'), expected) + + def test_invalid_BOM(self): + po = codecs.BOM_UTF8 + b'msgid "foo"\nmsgstr "bar"' + with self.assertRaises(ValueError) as cm: + parse_po(po, 'foo.po') + self.assertIn("starts with a UTF-8 BOM", str(cm.exception)) + + def test_parse(self): + filename = DATA_DIR / 'general.po' + messages = parse_po(filename.read_bytes(), filename) + expected = json.loads( + (DATA_DIR / 'general.json').read_text(encoding='utf-8')) + self.assertEqual(messages, expected) + def extract_from_snapshots(): + exclude_file = DATA_DIR / 'exclude_file.pot' snapshots = { 'messages.py': (), 'fileloc.py': ('--docstrings',), @@ -526,6 +776,8 @@ def extract_from_snapshots(): 'custom_keywords.py': ('--keyword=foo', '--keyword=nfoo:1,2', '--keyword=pfoo:1c,2', '--keyword=npfoo:1c,2,3', '--keyword=_:1,2'), + # Test excluded msgids with an exclude file + 'excluded.py': (f'--exclude-file={exclude_file}',), # == Test character escaping # Escape ascii and unicode: 'escapes.py': ('--escape', '--add-comments='), @@ -556,9 +808,16 @@ def update_POT_snapshots(): output_file.write_text(output, encoding='utf-8') +def update_PO_snapshots(): + messages = parse_po((DATA_DIR / 'general.po').read_bytes(), 'general.po') + data = json.dumps(messages, indent=4, ensure_ascii=False) + (DATA_DIR / 'general.json').write_text(data, encoding='utf-8') + + if __name__ == '__main__': # To regenerate POT files if len(sys.argv) > 1 and sys.argv[1] == '--snapshot-update': update_POT_snapshots() + update_PO_snapshots() sys.exit(0) unittest.main() diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst b/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst new file mode 100644 index 00000000000000..83cb6b4b450a34 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst @@ -0,0 +1 @@ +Fix the :option:`!--exclude-file` option in :program:`pygettext`. diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index f0ee2ea386f18f..2bece8e796945b 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -125,7 +125,7 @@ Set width of output to columns. -x filename - --exclude-file=filename + --exclude-file=filename.po Specify a file that contains a list of strings that are not be extracted from the input files. Each string to be excluded must appear on a line by itself in the file. @@ -140,15 +140,19 @@ """ import ast +import codecs import getopt import glob import importlib.machinery import importlib.util import os +import re import sys import time import tokenize from dataclasses import dataclass, field +from email.parser import HeaderParser +from enum import StrEnum, auto from io import BytesIO from operator import itemgetter @@ -279,6 +283,272 @@ def getFilesForName(name): return [] +def _key_for(msgid, msgctxt=None): + if msgctxt is None: + return msgid + return (msgctxt, msgid) + + +class POSection(StrEnum): + COMMENT = 'comment' + CTXT = 'msgctxt' + ID = 'msgid' + PLURAL = 'msgid_plural' + STR = 'msgstr' + + +def parse_po(po, filename): + """Parse a PO file.""" + if po.startswith(codecs.BOM_UTF8): + raise ValueError( + f"The file {filename} starts with a UTF-8 BOM which is not " + "allowed in .po files.\nPlease save the file without a BOM " + "and try again.") + + @dataclass + class ParserState: + filename: str + lineno: int = 0 + # Start off assuming Latin-1, so everything decodes without failure, + # until we know the exact encoding + encoding: str = 'latin-1' + # Current section + section: POSection | None = None + # Current message data + msgid: str | None = None + msgid_plural: str | None = None + msgctxt: str | None = None + msgstr: str | list[str] | None = None + # All parsed messages + messages: dict = field(default_factory=dict) + + @property + def is_plural(self): + return self.msgid_plural is not None + + + state = ParserState(filename) + # Parse the PO file + for line in po.splitlines(): + state.lineno += 1 + + # Skip empty lines + if not line.strip(): + continue + + if line.startswith(b'#'): + parse_comment(state) + elif line.startswith(b'msgctxt'): + parse_msgctxt(state, line) + elif line.startswith(b'msgid_plural'): + parse_msgid_plural(state, line) + elif line.startswith(b'msgid'): + parse_msgid(state, line) + elif line.startswith(b'msgstr'): + parse_msgstr(state, line) + else: + # Line containing only a string without a keyword + # This will be appended to the previous section + parse_line(state, line) + + if state.section == POSection.CTXT: + raise ValueError(f'{filename}:{state.lineno}: ' + 'Missing msgid after msgctxt') + if state.section == POSection.ID: + raise ValueError(f'{filename}:{state.lineno}: ' + 'Missing msgstr after msgid') + elif state.section == POSection.STR: + # Add last entry + _add_message(state) + return list(state.messages.values()) + + +def parse_comment(state): + if state.section not in (None, POSection.COMMENT, POSection.STR): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Comment line not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + state.section = POSection.COMMENT + + +def parse_msgctxt(state, line): + if state.section not in (None, POSection.COMMENT, POSection.STR): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgctxt not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + line = line.decode(state.encoding).removeprefix('msgctxt') + state.msgctxt = parse_quoted_strings(state, line) + state.section = POSection.CTXT + + +def parse_msgid_plural(state, line): + if state.section is None: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgid_plural must be preceded by msgid') + if state.section != POSection.ID: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgid_plural not allowed after {state.section}') + + line = line.decode(state.encoding).removeprefix('msgid_plural') + state.msgid_plural = parse_quoted_strings(state, line) + state.section = POSection.PLURAL + + +def parse_msgid(state, line): + if state.section not in (None, POSection.COMMENT, + POSection.STR, POSection.CTXT): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgid not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + line = line.decode(state.encoding).removeprefix('msgid') + state.msgid = parse_quoted_strings(state, line) + state.section = POSection.ID + + +def parse_msgstr(state, line): + if state.section is None: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgstr must be preceded by msgid') + if state.section not in (POSection.STR, POSection.ID, POSection.PLURAL): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgstr not allowed after {state.section}') + + line = line.decode(state.encoding) + if match := re.match(r'^msgstr\[(\d+)\]', line): + # This is a plural msgstr, e.g. msgstr[0] + if not state.is_plural: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Missing msgid_plural section') + index = int(match.group(1)) + line = line.removeprefix(match.group()) + if state.msgstr is None: + state.msgstr = [] + next_plural_index = len(state.msgstr) + if index != next_plural_index: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Plural form has incorrect index, found ' + f"'{index}' but should be '{next_plural_index}'") + state.msgstr.append(parse_quoted_strings(state, line)) + else: + # This is a regular (non-plural) msgstr + if state.is_plural: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Indexed msgstr required after msgid_plural') + if state.section == POSection.STR: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgstr not allowed after msgstr') + line = line.removeprefix('msgstr') + state.msgstr = parse_quoted_strings(state, line) + state.section = POSection.STR + + +def parse_line(state, line): + line = parse_quoted_strings(state, line.decode(state.encoding)) + if state.section == POSection.CTXT: + state.msgctxt += line + elif state.section == POSection.PLURAL: + state.msgid_plural += line + elif state.section == POSection.ID: + state.msgid += line + elif state.section == POSection.STR: + if isinstance(state.msgstr, list): + # This belongs to the last msgstr[N] entry + state.msgstr[-1] += line + else: + state.msgstr += line + else: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Syntax error before:\n{line}') + + +def parse_quoted_strings(state, line): + """ + Parse a line containing one or more quoted PO strings separated + by whitespace. + + Example: "Hello, " "world!" -> 'Hello, world!' + """ + line = line.strip() + if not line: + return '' + + quoted_string = r'"([^"\\]|\\.)*"' + # One or more quoted strings, possibly separated by whitespace + quoted_strings = fr'^({quoted_string}\s*)+$' + + if not re.match(quoted_strings, line): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Syntax error: {line}') + + string = '' + for match in re.finditer(quoted_string, line): + part = match.group() + string += parse_quoted_string(state, part) + return string + + +def parse_quoted_string(state, string): + """Parse a single quoted PO string.""" + # Check if there are any disallowed escape sequences + # The allowed escape sequences are: + # - \n, \r, \t, \\, \", \a, \b, \f, \v + # - Octal escapes: \o, \oo, \ooo + # - Hex escapes: \xh, \xhh, ... + if match := re.search(r'\\[^"\\abfnrtvx0-7]|\\x[^0-9a-fA-F]', string): + escape = match.group() + raise ValueError(f'{state.filename}:{state.lineno}: ' + f"Invalid escape sequence: '{escape}'") + + try: + return ast.literal_eval(string) + except (ValueError, SyntaxError) as e: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f"Invalid syntax: {string}") from e + +def _add_message(state): + key = _key_for(state.msgid, state.msgctxt) + if key in state.messages: + # PO files don't allow duplicate entries + raise ValueError(f"{state.filename}:{state.lineno}: " + f"Duplicate entry: {key!r}") + state.messages[key] = {'msgctxt': state.msgctxt, + 'msgid': state.msgid, + 'msgid_plural': state.msgid_plural, + 'msgstr': state.msgstr} + if state.msgid == "": + # This is the header, see whether there is an encoding declaration + state.encoding = _get_encoding(state.msgstr) + # Reset the message data + state.msgctxt = None + state.msgid = None + state.msgid_plural = None + state.msgstr = None + + +def _get_encoding(msgstr): + """Get the encoding from the header msgstr, if provided.""" + p = HeaderParser() + charset = p.parsestr(msgstr).get_content_charset() + return charset or 'latin-1' + + +def get_msgids_from_exclude_file(filename): + with open(filename, 'rb') as f: + po = f.read() + + messages = parse_po(po, filename) + return {m['msgid'] for m in messages} + + # Key is the function name, value is a dictionary mapping argument positions to the # type of the argument. The type is one of 'msgid', 'msgid_plural', or 'msgctxt'. DEFAULTKEYWORDS = { @@ -533,7 +803,7 @@ def _add_message( if not comments: comments = [] - key = self._key_for(msgid, msgctxt) + key = _key_for(msgid, msgctxt) message = self.messages.get(key) if message: message.add_location( @@ -553,12 +823,6 @@ def _add_message( comments=comments, ) - @staticmethod - def _key_for(msgid, msgctxt=None): - if msgctxt is not None: - return (msgctxt, msgid) - return msgid - def _get_func_name(self, node): match node.func: case ast.Name(id=id): @@ -742,14 +1006,18 @@ class Options: # initialize list of strings to exclude if options.excludefilename: try: - with open(options.excludefilename) as fp: - options.toexclude = fp.readlines() + options.toexclude = get_msgids_from_exclude_file( + options.excludefilename) + except ValueError as e: + print(f'Invalid exclude file ({options.excludefilename}): {e}', + file=sys.stderr) + sys.exit(1) except IOError: print(f"Can't read --exclude-file: {options.excludefilename}", file=sys.stderr) sys.exit(1) else: - options.toexclude = [] + options.toexclude = set() # resolve args to module lists expanded = []