Skip to content

Commit 1f2f9c4

Browse files
[3.13] gh-121188: Sanitize invalid XML characters in regrtest (GH-121195) (#121204)
gh-121188: Sanitize invalid XML characters in regrtest (GH-121195) When creating the JUnit XML file, regrtest now escapes characters which are invalid in XML, such as the chr(27) control character used in ANSI escape sequences. (cherry picked from commit af8c3d7) Co-authored-by: Victor Stinner <[email protected]>
1 parent 82777cd commit 1f2f9c4

File tree

4 files changed

+91
-5
lines changed

4 files changed

+91
-5
lines changed

Lib/test/libregrtest/testresult.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import traceback
1010
import unittest
1111
from test import support
12+
from test.libregrtest.utils import sanitize_xml
1213

1314
class RegressionTestResult(unittest.TextTestResult):
1415
USE_XML = False
@@ -65,23 +66,24 @@ def _add_result(self, test, capture=False, **args):
6566
if capture:
6667
if self._stdout_buffer is not None:
6768
stdout = self._stdout_buffer.getvalue().rstrip()
68-
ET.SubElement(e, 'system-out').text = stdout
69+
ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
6970
if self._stderr_buffer is not None:
7071
stderr = self._stderr_buffer.getvalue().rstrip()
71-
ET.SubElement(e, 'system-err').text = stderr
72+
ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
7273

7374
for k, v in args.items():
7475
if not k or not v:
7576
continue
77+
7678
e2 = ET.SubElement(e, k)
7779
if hasattr(v, 'items'):
7880
for k2, v2 in v.items():
7981
if k2:
80-
e2.set(k2, str(v2))
82+
e2.set(k2, sanitize_xml(str(v2)))
8183
else:
82-
e2.text = str(v2)
84+
e2.text = sanitize_xml(str(v2))
8385
else:
84-
e2.text = str(v)
86+
e2.text = sanitize_xml(str(v))
8587

8688
@classmethod
8789
def __makeErrorDict(cls, err_type, err_value, err_tb):

Lib/test/libregrtest/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os.path
66
import platform
77
import random
8+
import re
89
import shlex
910
import signal
1011
import subprocess
@@ -712,3 +713,24 @@ def get_signal_name(exitcode):
712713
pass
713714

714715
return None
716+
717+
718+
ILLEGAL_XML_CHARS_RE = re.compile(
719+
'['
720+
# Control characters; line feed (\x0A), carriage return (\x0D) and TAB (\x09) are legal
721+
'\x00-\x08\x0B\x0C\x0E-\x1F'
722+
# Surrogate characters
723+
'\uD800-\uDFFF'
724+
# Special Unicode characters
725+
'\uFFFE'
726+
'\uFFFF'
727+
# Match multiple sequential invalid characters for better efficiency
728+
']+')
729+
730+
def _sanitize_xml_replace(regs):
731+
text = regs[0]
732+
return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
733+
for ch in text)
734+
735+
def sanitize_xml(text):
736+
return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)

Lib/test/test_regrtest.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import tempfile
2222
import textwrap
2323
import unittest
24+
from xml.etree import ElementTree
25+
2426
from test import support
2527
from test.support import os_helper, without_optimizer
2628
from test.libregrtest import cmdline
@@ -2243,6 +2245,44 @@ def test_pass(self):
22432245
self.check_executed_tests(output, testname, stats=1, parallel=True)
22442246
self.assertNotIn('SPAM SPAM SPAM', output)
22452247

2248+
def test_xml(self):
2249+
code = textwrap.dedent(r"""
2250+
import unittest
2251+
from test import support
2252+
2253+
class VerboseTests(unittest.TestCase):
2254+
def test_failed(self):
2255+
print("abc \x1b def")
2256+
self.fail()
2257+
""")
2258+
testname = self.create_test(code=code)
2259+
2260+
# Run sequentially
2261+
filename = os_helper.TESTFN
2262+
self.addCleanup(os_helper.unlink, filename)
2263+
2264+
output = self.run_tests(testname, "--junit-xml", filename,
2265+
exitcode=EXITCODE_BAD_TEST)
2266+
self.check_executed_tests(output, testname,
2267+
failed=testname,
2268+
stats=TestStats(1, 1, 0))
2269+
2270+
# Test generated XML
2271+
with open(filename, encoding="utf8") as fp:
2272+
content = fp.read()
2273+
2274+
testsuite = ElementTree.fromstring(content)
2275+
self.assertEqual(int(testsuite.get('tests')), 1)
2276+
self.assertEqual(int(testsuite.get('errors')), 0)
2277+
self.assertEqual(int(testsuite.get('failures')), 1)
2278+
2279+
testcase = testsuite[0][0]
2280+
self.assertEqual(testcase.get('status'), 'run')
2281+
self.assertEqual(testcase.get('result'), 'completed')
2282+
self.assertGreater(float(testcase.get('time')), 0)
2283+
for out in testcase.iter('system-out'):
2284+
self.assertEqual(out.text, r"abc \x1b def")
2285+
22462286

22472287
class TestUtils(unittest.TestCase):
22482288
def test_format_duration(self):
@@ -2426,6 +2466,25 @@ def id(self):
24262466
self.assertTrue(match_test(test_chdir))
24272467
self.assertFalse(match_test(test_copy))
24282468

2469+
def test_sanitize_xml(self):
2470+
sanitize_xml = utils.sanitize_xml
2471+
2472+
# escape invalid XML characters
2473+
self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
2474+
r'abc \x1b\x1f def')
2475+
self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
2476+
r'nul:\x00, bell:\x07')
2477+
self.assertEqual(sanitize_xml('surrogate:\uDC80'),
2478+
r'surrogate:\udc80')
2479+
self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
2480+
r'illegal \ufffe and \uffff')
2481+
2482+
# no escape for valid XML characters
2483+
self.assertEqual(sanitize_xml('a\n\tb'),
2484+
'a\n\tb')
2485+
self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
2486+
'valid t\xe9xt \u20ac')
2487+
24292488

24302489
if __name__ == '__main__':
24312490
unittest.main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
When creating the JUnit XML file, regrtest now escapes characters which are
2+
invalid in XML, such as the chr(27) control character used in ANSI escape
3+
sequences. Patch by Victor Stinner.

0 commit comments

Comments
 (0)