Skip to content

Commit af8c3d7

Browse files
authored
gh-121188: Sanitize invalid XML characters in regrtest (#121195)
When creating the JUnit XML file, regrtest now escapes characters which are invalid in XML, such as the chr(27) control character used in ANSI escape sequences.
1 parent f80376b commit af8c3d7

File tree

4 files changed

+91
-5
lines changed

4 files changed

+91
-5
lines changed

Lib/test/libregrtest/testresult.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import traceback
1010
import unittest
1111
from test import support
12+
from test.libregrtest.utils import sanitize_xml
1213

1314
class RegressionTestResult(unittest.TextTestResult):
1415
USE_XML = False
@@ -65,23 +66,24 @@ def _add_result(self, test, capture=False, **args):
6566
if capture:
6667
if self._stdout_buffer is not None:
6768
stdout = self._stdout_buffer.getvalue().rstrip()
68-
ET.SubElement(e, 'system-out').text = stdout
69+
ET.SubElement(e, 'system-out').text = sanitize_xml(stdout)
6970
if self._stderr_buffer is not None:
7071
stderr = self._stderr_buffer.getvalue().rstrip()
71-
ET.SubElement(e, 'system-err').text = stderr
72+
ET.SubElement(e, 'system-err').text = sanitize_xml(stderr)
7273

7374
for k, v in args.items():
7475
if not k or not v:
7576
continue
77+
7678
e2 = ET.SubElement(e, k)
7779
if hasattr(v, 'items'):
7880
for k2, v2 in v.items():
7981
if k2:
80-
e2.set(k2, str(v2))
82+
e2.set(k2, sanitize_xml(str(v2)))
8183
else:
82-
e2.text = str(v2)
84+
e2.text = sanitize_xml(str(v2))
8385
else:
84-
e2.text = str(v)
86+
e2.text = sanitize_xml(str(v))
8587

8688
@classmethod
8789
def __makeErrorDict(cls, err_type, err_value, err_tb):

Lib/test/libregrtest/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os.path
66
import platform
77
import random
8+
import re
89
import shlex
910
import signal
1011
import subprocess
@@ -712,3 +713,24 @@ def get_signal_name(exitcode):
712713
pass
713714

714715
return None
716+
717+
718+
# Matches one or more consecutive characters that cannot be written to XML.
ILLEGAL_XML_CHARS_RE = re.compile(
    '['
    # Control characters below the space character; TAB (\x09) and the
    # newline characters (\x0A and \x0D) are legal, so they are left out.
    '\x00-\x08\x0B\x0C\x0E-\x1F'
    # Surrogate characters
    '\uD800-\uDFFF'
    # Special Unicode characters
    '\uFFFE'
    '\uFFFF'
    # The trailing "+" matches a whole run of sequential invalid characters
    # at once, for better efficiency.
    ']+')


def _sanitize_xml_replace(regs):
    """Return the escaped spelling of one matched run of illegal characters.

    *regs* is the match object passed in by the pattern's ``sub()`` call;
    ``regs[0]`` is the matched run.  Characters up to U+00FF are written as
    a backslash, ``x`` and two lowercase hex digits.  Any other character
    uses the escape produced by ``ascii()``, with the quotes stripped.
    """
    pieces = []
    for char in regs[0]:
        if char <= '\xff':
            pieces.append(f'\\x{ord(char):02x}')
        else:
            # ascii() wraps its result in quotes; drop them.
            pieces.append(ascii(char)[1:-1])
    return ''.join(pieces)


def sanitize_xml(text):
    """Return *text* with XML-illegal characters replaced by backslash escapes.

    Characters not matched by ILLEGAL_XML_CHARS_RE are returned unchanged.
    """
    escaped = ILLEGAL_XML_CHARS_RE.sub(_sanitize_xml_replace, text)
    return escaped

Lib/test/test_regrtest.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import tempfile
2222
import textwrap
2323
import unittest
24+
from xml.etree import ElementTree
25+
2426
from test import support
2527
from test.support import import_helper
2628
from test.support import os_helper
@@ -2254,6 +2256,44 @@ def test_pass(self):
22542256
self.check_executed_tests(output, testname, stats=1, parallel=True)
22552257
self.assertNotIn('SPAM SPAM SPAM', output)
22562258

2259+
def test_xml(self):
    # The generated test prints an ESC character (invalid in XML) and then
    # fails; the JUnit XML file must still be parseable afterwards.
    source = textwrap.dedent(r"""
        import unittest
        from test import support

        class VerboseTests(unittest.TestCase):
            def test_failed(self):
                print("abc \x1b def")
                self.fail()
    """)
    testname = self.create_test(code=source)

    # Run sequentially
    xml_path = os_helper.TESTFN
    self.addCleanup(os_helper.unlink, xml_path)

    output = self.run_tests(testname, "--junit-xml", xml_path,
                            exitcode=EXITCODE_BAD_TEST)
    self.check_executed_tests(output, testname,
                              failed=testname,
                              stats=TestStats(1, 1, 0))

    # Test generated XML
    with open(xml_path, encoding="utf8") as xml_file:
        xml_text = xml_file.read()

    testsuite = ElementTree.fromstring(xml_text)
    expected_counters = (('tests', 1), ('errors', 0), ('failures', 1))
    for attribute, expected in expected_counters:
        self.assertEqual(int(testsuite.get(attribute)), expected)

    testcase = testsuite[0][0]
    self.assertEqual(testcase.get('status'), 'run')
    self.assertEqual(testcase.get('result'), 'completed')
    self.assertGreater(float(testcase.get('time')), 0)
    # The ESC character must show up as its escaped spelling.
    for captured in testcase.iter('system-out'):
        self.assertEqual(captured.text, r"abc \x1b def")
2296+
22572297

22582298
class TestUtils(unittest.TestCase):
22592299
def test_format_duration(self):
@@ -2437,6 +2477,25 @@ def id(self):
24372477
self.assertTrue(match_test(test_chdir))
24382478
self.assertFalse(match_test(test_copy))
24392479

2480+
def test_sanitize_xml(self):
    # Each entry is (input text, expected sanitize_xml() result).
    cases = (
        # escape invalid XML characters
        ('abc \x1b\x1f def', r'abc \x1b\x1f def'),
        ('nul:\x00, bell:\x07', r'nul:\x00, bell:\x07'),
        ('surrogate:\uDC80', r'surrogate:\udc80'),
        ('illegal \uFFFE and \uFFFF', r'illegal \ufffe and \uffff'),

        # no escape for valid XML characters
        ('a\n\tb', 'a\n\tb'),
        ('valid t\xe9xt \u20ac', 'valid t\xe9xt \u20ac'),
    )
    sanitize = utils.sanitize_xml
    for raw, expected in cases:
        self.assertEqual(sanitize(raw), expected)
2498+
24402499

24412500
if __name__ == '__main__':
24422501
unittest.main()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
When creating the JUnit XML file, regrtest now escapes characters which are
2+
invalid in XML, such as the chr(27) control character used in ANSI escape
3+
sequences. Patch by Victor Stinner.

0 commit comments

Comments
 (0)