Skip to content

Commit b80edaf

Browse files
[3.12] gh-121188: Sanitize invalid XML characters in regrtest (GH-121195) (#121205)
gh-121188: Sanitize invalid XML characters in regrtest (GH-121195). When creating the JUnit XML file, regrtest now escapes characters which are invalid in XML, such as the chr(27) control character used in ANSI escape sequences. (cherry picked from commit af8c3d7) Co-authored-by: Victor Stinner <[email protected]>
1 parent 99bc858 commit b80edaf

File tree

4 files changed

+91
-5
lines changed

4 files changed

+91
-5
lines changed

Lib/test/libregrtest/testresult.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import traceback
1010
import unittest
1111
from test import support
12+
from test.libregrtest.utils import sanitize_xml
1213

1314
class RegressionTestResult(unittest.TextTestResult):
1415
USE_XML = False
@@ -65,23 +66,24 @@ def _add_result(self, test, capture=False, **args):
6566
if capture:
6667
if self._stdout_buffer is not None:
6768
stdout = self._stdout_buffer.getvalue().rstrip()
68-
ET.SubElement(e, 'system-out').text = stdout
69+
ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
6970
if self._stderr_buffer is not None:
7071
stderr = self._stderr_buffer.getvalue().rstrip()
71-
ET.SubElement(e, 'system-err').text = stderr
72+
ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
7273

7374
for k, v in args.items():
7475
if not k or not v:
7576
continue
77+
7678
e2 = ET.SubElement(e, k)
7779
if hasattr(v, 'items'):
7880
for k2, v2 in v.items():
7981
if k2:
80-
e2.set(k2, str(v2))
82+
e2.set(k2, sanitize_xml(str(v2)))
8183
else:
82-
e2.text = str(v2)
84+
e2.text = sanitize_xml(str(v2))
8385
else:
84-
e2.text = str(v)
86+
e2.text = sanitize_xml(str(v))
8587

8688
@classmethod
8789
def __makeErrorDict(cls, err_type, err_value, err_tb):

Lib/test/libregrtest/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os.path
66
import platform
77
import random
8+
import re
89
import shlex
910
import signal
1011
import subprocess
@@ -710,3 +711,24 @@ def get_signal_name(exitcode):
710711
pass
711712

712713
return None
714+
715+
716+
ILLEGAL_XML_CHARS_RE = re.compile(
717+
'['
718+
# Control characters; line feed (\x0A), carriage return (\x0D) and TAB (\x09) are legal
719+
'\x00-\x08\x0B\x0C\x0E-\x1F'
720+
# Surrogate characters
721+
'\uD800-\uDFFF'
722+
# Special Unicode characters
723+
'\uFFFE'
724+
'\uFFFF'
725+
# Match multiple sequential invalid characters for better efficiency
726+
']+')
727+
728+
def _sanitize_xml_replace(regs):
729+
text = regs[0]
730+
return ''.join(f'\\x{ord(ch):02x}' if ch <= '\xff' else ascii(ch)[1:-1]
731+
for ch in text)
732+
733+
def sanitize_xml(text):
734+
return ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)

Lib/test/test_regrtest.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import tempfile
2222
import textwrap
2323
import unittest
24+
from xml.etree import ElementTree
25+
2426
from test import support
2527
from test.support import os_helper
2628
from test.libregrtest import cmdline
@@ -2221,6 +2223,44 @@ def test_pass(self):
22212223
self.check_executed_tests(output, testname, stats=1, parallel=True)
22222224
self.assertNotIn('SPAM SPAM SPAM', output)
22232225

2226+
def test_xml(self):
2227+
code = textwrap.dedent(r"""
2228+
import unittest
2229+
from test import support
2230+
2231+
class VerboseTests(unittest.TestCase):
2232+
def test_failed(self):
2233+
print("abc \x1b def")
2234+
self.fail()
2235+
""")
2236+
testname = self.create_test(code=code)
2237+
2238+
# Run sequentially
2239+
filename = os_helper.TESTFN
2240+
self.addCleanup(os_helper.unlink, filename)
2241+
2242+
output = self.run_tests(testname, "--junit-xml", filename,
2243+
exitcode=EXITCODE_BAD_TEST)
2244+
self.check_executed_tests(output, testname,
2245+
failed=testname,
2246+
stats=TestStats(1, 1, 0))
2247+
2248+
# Test generated XML
2249+
with open(filename, encoding="utf8") as fp:
2250+
content = fp.read()
2251+
2252+
testsuite = ElementTree.fromstring(content)
2253+
self.assertEqual(int(testsuite.get('tests')), 1)
2254+
self.assertEqual(int(testsuite.get('errors')), 0)
2255+
self.assertEqual(int(testsuite.get('failures')), 1)
2256+
2257+
testcase = testsuite[0][0]
2258+
self.assertEqual(testcase.get('status'), 'run')
2259+
self.assertEqual(testcase.get('result'), 'completed')
2260+
self.assertGreater(float(testcase.get('time')), 0)
2261+
for out in testcase.iter('system-out'):
2262+
self.assertEqual(out.text, r"abc \x1b def")
2263+
22242264

22252265
class TestUtils(unittest.TestCase):
22262266
def test_format_duration(self):
@@ -2403,6 +2443,25 @@ def id(self):
24032443
self.assertTrue(match_test(test_chdir))
24042444
self.assertFalse(match_test(test_copy))
24052445

2446+
def test_sanitize_xml(self):
2447+
sanitize_xml = utils.sanitize_xml
2448+
2449+
# escape invalid XML characters
2450+
self.assertEqual(sanitize_xml('abc \x1b\x1f def'),
2451+
r'abc \x1b\x1f def')
2452+
self.assertEqual(sanitize_xml('nul:\x00, bell:\x07'),
2453+
r'nul:\x00, bell:\x07')
2454+
self.assertEqual(sanitize_xml('surrogate:\uDC80'),
2455+
r'surrogate:\udc80')
2456+
self.assertEqual(sanitize_xml('illegal \uFFFE and \uFFFF'),
2457+
r'illegal \ufffe and \uffff')
2458+
2459+
# no escape for valid XML characters
2460+
self.assertEqual(sanitize_xml('a\n\tb'),
2461+
'a\n\tb')
2462+
self.assertEqual(sanitize_xml('valid t\xe9xt \u20ac'),
2463+
'valid t\xe9xt \u20ac')
2464+
24062465

24072466
if __name__ == '__main__':
24082467
unittest.main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
When creating the JUnit XML file, regrtest now escapes characters which are
2+
invalid in XML, such as the chr(27) control character used in ANSI escape
3+
sequences. Patch by Victor Stinner.

0 commit comments

Comments
 (0)