Skip to content

Commit 9ad57cf

Browse files
authored
gh-120754: Add a strace helper and test set of syscalls for open().read(), Take 2 (#123413)
1 parent 7d7d56d commit 9ad57cf

File tree

4 files changed

+329
-33
lines changed

4 files changed

+329
-33
lines changed

Lib/test/support/strace_helper.py

+170
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
import re
2+
import sys
3+
import textwrap
4+
import unittest
5+
from dataclasses import dataclass
6+
from functools import cache
7+
from test import support
8+
from test.support.script_helper import run_python_until_end
9+
10+
_strace_binary = "/usr/bin/strace"
11+
_syscall_regex = re.compile(
12+
r"(?P<syscall>[^(]*)\((?P<args>[^)]*)\)\s*[=]\s*(?P<returncode>.+)")
13+
_returncode_regex = re.compile(
14+
br"\+\+\+ exited with (?P<returncode>\d+) \+\+\+")
15+
16+
17+
@dataclass
18+
class StraceEvent:
19+
syscall: str
20+
args: list[str]
21+
returncode: str
22+
23+
24+
@dataclass
25+
class StraceResult:
26+
strace_returncode: int
27+
python_returncode: int
28+
29+
"""The event messages generated by strace. This is very similar to the
30+
stderr strace produces with returncode marker section removed."""
31+
event_bytes: bytes
32+
stdout: bytes
33+
stderr: bytes
34+
35+
def events(self):
36+
"""Parse event_bytes data into system calls for easier processing.
37+
38+
This assumes the program under inspection doesn't print any non-utf8
39+
strings which would mix into the strace output."""
40+
decoded_events = self.event_bytes.decode('utf-8')
41+
matches = [
42+
_syscall_regex.match(event)
43+
for event in decoded_events.splitlines()
44+
]
45+
return [
46+
StraceEvent(match["syscall"],
47+
[arg.strip() for arg in (match["args"].split(","))],
48+
match["returncode"]) for match in matches if match
49+
]
50+
51+
def sections(self):
52+
"""Find all "MARK <X>" writes and use them to make groups of events.
53+
54+
This is useful to avoid variable / overhead events, like those at
55+
interpreter startup or when opening a file so a test can verify just
56+
the small case under study."""
57+
current_section = "__startup"
58+
sections = {current_section: []}
59+
for event in self.events():
60+
if event.syscall == 'write' and len(
61+
event.args) > 2 and event.args[1].startswith("\"MARK "):
62+
# Found a new section, don't include the write in the section
63+
# but all events until next mark should be in that section
64+
current_section = event.args[1].split(
65+
" ", 1)[1].removesuffix('\\n"')
66+
if current_section not in sections:
67+
sections[current_section] = list()
68+
else:
69+
sections[current_section].append(event)
70+
71+
return sections
72+
73+
74+
@support.requires_subprocess()
75+
def strace_python(code, strace_flags, check=True):
76+
"""Run strace and return the trace.
77+
78+
Sets strace_returncode and python_returncode to `-1` on error."""
79+
res = None
80+
81+
def _make_error(reason, details):
82+
return StraceResult(
83+
strace_returncode=-1,
84+
python_returncode=-1,
85+
event_bytes=f"error({reason},details={details}) = -1".encode('utf-8'),
86+
stdout=res.out if res else b"",
87+
stderr=res.err if res else b"")
88+
89+
# Run strace, and get out the raw text
90+
try:
91+
res, cmd_line = run_python_until_end(
92+
"-c",
93+
textwrap.dedent(code),
94+
__run_using_command=[_strace_binary] + strace_flags)
95+
except OSError as err:
96+
return _make_error("Caught OSError", err)
97+
98+
if check and res.rc:
99+
res.fail(cmd_line)
100+
101+
# Get out program returncode
102+
stripped = res.err.strip()
103+
output = stripped.rsplit(b"\n", 1)
104+
if len(output) != 2:
105+
return _make_error("Expected strace events and exit code line",
106+
stripped[-50:])
107+
108+
returncode_match = _returncode_regex.match(output[1])
109+
if not returncode_match:
110+
return _make_error("Expected to find returncode in last line.",
111+
output[1][:50])
112+
113+
python_returncode = int(returncode_match["returncode"])
114+
if check and python_returncode:
115+
res.fail(cmd_line)
116+
117+
return StraceResult(strace_returncode=res.rc,
118+
python_returncode=python_returncode,
119+
event_bytes=output[0],
120+
stdout=res.out,
121+
stderr=res.err)
122+
123+
124+
def get_events(code, strace_flags, prelude, cleanup):
125+
# NOTE: The flush is currently required to prevent the prints from getting
126+
# buffered and done all at once at exit
127+
prelude = textwrap.dedent(prelude)
128+
code = textwrap.dedent(code)
129+
cleanup = textwrap.dedent(cleanup)
130+
to_run = f"""
131+
print("MARK prelude", flush=True)
132+
{prelude}
133+
print("MARK code", flush=True)
134+
{code}
135+
print("MARK cleanup", flush=True)
136+
{cleanup}
137+
print("MARK __shutdown", flush=True)
138+
"""
139+
trace = strace_python(to_run, strace_flags)
140+
all_sections = trace.sections()
141+
return all_sections['code']
142+
143+
144+
def get_syscalls(code, strace_flags, prelude="", cleanup=""):
145+
"""Get the syscalls which a given chunk of python code generates"""
146+
events = get_events(code, strace_flags, prelude=prelude, cleanup=cleanup)
147+
return [ev.syscall for ev in events]
148+
149+
150+
# Moderately expensive (spawns a subprocess), so share results when possible.
151+
@cache
152+
def _can_strace():
153+
res = strace_python("import sys; sys.exit(0)", [], check=False)
154+
assert res.events(), "Should have parsed multiple calls"
155+
156+
return res.strace_returncode == 0 and res.python_returncode == 0
157+
158+
159+
def requires_strace():
160+
if sys.platform != "linux":
161+
return unittest.skip("Linux only, requires strace.")
162+
163+
if support.check_sanitizer(address=True, memory=True):
164+
return unittest.skip("LeakSanitizer does not work under ptrace (strace, gdb, etc)")
165+
166+
return unittest.skipUnless(_can_strace(), "Requires working strace")
167+
168+
169+
__all__ = ["get_events", "get_syscalls", "requires_strace", "strace_python",
170+
"StraceEvent", "StraceResult"]

Lib/test/test_fileio.py

+137-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from test.support import (
1313
cpython_only, swap_attr, gc_collect, is_emscripten, is_wasi,
14-
infinite_recursion,
14+
infinite_recursion, strace_helper
1515
)
1616
from test.support.os_helper import (
1717
TESTFN, TESTFN_ASCII, TESTFN_UNICODE, make_bad_fd,
@@ -24,6 +24,9 @@
2424
import _pyio # Python implementation of io
2525

2626

27+
_strace_flags=["--trace=%file,%desc"]
28+
29+
2730
class AutoFileTests:
2831
# file tests for which a test file is automatically set up
2932

@@ -359,6 +362,139 @@ def testErrnoOnClosedReadinto(self, f):
359362
a = array('b', b'x'*10)
360363
f.readinto(a)
361364

365+
@strace_helper.requires_strace()
366+
def test_syscalls_read(self):
367+
"""Check that the set of system calls produced by the I/O stack is what
368+
is expected for various read cases.
369+
370+
It's expected as bits of the I/O implementation change, this will need
371+
to change. The goal is to catch changes that unintentionally add
372+
additional systemcalls (ex. additional calls have been looked at in
373+
bpo-21679 and gh-120754).
374+
"""
375+
self.f.write(b"Hello, World!")
376+
self.f.close()
377+
378+
379+
def check_readall(name, code, prelude="", cleanup="",
380+
extra_checks=None):
381+
with self.subTest(name=name):
382+
syscalls = strace_helper.get_events(code, _strace_flags,
383+
prelude=prelude,
384+
cleanup=cleanup)
385+
386+
# The first call should be an open that returns a
387+
# file descriptor (fd). Afer that calls may vary. Once the file
388+
# is opened, check calls refer to it by fd as the filename
389+
# could be removed from the filesystem, renamed, etc. See:
390+
# Time-of-check time-of-use (TOCTOU) software bug class.
391+
#
392+
# There are a number of related but distinct open system calls
393+
# so not checking precise name here.
394+
self.assertGreater(
395+
len(syscalls),
396+
1,
397+
f"Should have had at least an open call|calls={syscalls}")
398+
fd_str = syscalls[0].returncode
399+
400+
# All other calls should contain the fd in their argument set.
401+
for ev in syscalls[1:]:
402+
self.assertIn(
403+
fd_str,
404+
ev.args,
405+
f"Looking for file descriptor in arguments|ev={ev}"
406+
)
407+
408+
# There are a number of related syscalls used to implement
409+
# behaviors in a libc (ex. fstat, newfstatat, statx, open, openat).
410+
# Allow any that use the same substring.
411+
def count_similarname(name):
412+
return len([ev for ev in syscalls if name in ev.syscall])
413+
414+
checks = [
415+
# Should open and close the file exactly once
416+
("open", 1),
417+
("close", 1),
418+
# There should no longer be an isatty call (All files being
419+
# tested are block devices / not character devices).
420+
('ioctl', 0),
421+
# Should only have one fstat (bpo-21679, gh-120754)
422+
# note: It's important this uses a fd rather than filename,
423+
# That is validated by the `fd` check above.
424+
# note: fstat, newfstatat, and statx have all been observed
425+
# here in the underlying C library implementations.
426+
("stat", 1)
427+
]
428+
429+
if extra_checks:
430+
checks += extra_checks
431+
432+
for call, count in checks:
433+
self.assertEqual(
434+
count_similarname(call),
435+
count,
436+
msg=f"call={call}|count={count}|syscalls={syscalls}"
437+
)
438+
439+
# "open, read, close" file using different common patterns.
440+
check_readall(
441+
"open builtin with default options",
442+
f"""
443+
f = open('{TESTFN}')
444+
f.read()
445+
f.close()
446+
"""
447+
)
448+
449+
check_readall(
450+
"open in binary mode",
451+
f"""
452+
f = open('{TESTFN}', 'rb')
453+
f.read()
454+
f.close()
455+
"""
456+
)
457+
458+
check_readall(
459+
"open in text mode",
460+
f"""
461+
f = open('{TESTFN}', 'rt')
462+
f.read()
463+
f.close()
464+
""",
465+
# GH-122111: read_text uses BufferedIO which requires looking up
466+
# position in file. `read_bytes` disables that buffering and avoids
467+
# these calls which is tested the `pathlib read_bytes` case.
468+
extra_checks=[("seek", 1)]
469+
)
470+
471+
check_readall(
472+
"pathlib read_bytes",
473+
"p.read_bytes()",
474+
prelude=f"""from pathlib import Path; p = Path("{TESTFN}")""",
475+
# GH-122111: Buffering is disabled so these calls are avoided.
476+
extra_checks=[("seek", 0)]
477+
)
478+
479+
check_readall(
480+
"pathlib read_text",
481+
"p.read_text()",
482+
prelude=f"""from pathlib import Path; p = Path("{TESTFN}")"""
483+
)
484+
485+
# Focus on just `read()`.
486+
calls = strace_helper.get_syscalls(
487+
prelude=f"f = open('{TESTFN}')",
488+
code="f.read()",
489+
cleanup="f.close()",
490+
strace_flags=_strace_flags
491+
)
492+
# One to read all the bytes
493+
# One to read the EOF and get a size 0 return.
494+
self.assertEqual(calls.count("read"), 2)
495+
496+
497+
362498
class CAutoFileTests(AutoFileTests, unittest.TestCase):
363499
FileIO = _io.FileIO
364500
modulename = '_io'

0 commit comments

Comments
 (0)