From 36e9061a5627b3cf66c01764f2c60635bac1a45d Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Tue, 9 Jul 2024 22:00:36 -0700 Subject: [PATCH 1/4] GH-120754: Remove isatty call during regular read For POSIX, TTYs are never regular files, so if the interpreter knows the file is regular it doesn't need to do an additional system call to check if the file is a TTY. The `open()` Python builtin requires a `stat` call at present in order to ensure the file being opened isn't a directory. That result includes the file mode which tells us if it is a regular file. There are a number of attributes from the stat which are stashed one off currently, move to stashing the whole object rather than just individual members. The stat object is reasonably large and currently the `stat_result.st_size` member cannot be modified from Python, which is needed by the `_pyio` implementation, so make the whole stat object optional. In the `_io` implementation this makes handling a stat failure simpler. At present there is no explicit user call to clear it, but if one is needed (ex. a program which has a lot of open FileIO objects and the memory becomes a problem) it would be straightforward to add. Ideally would be able to automatically clear (the values are generally used during I/O object initialization and not after. After a `write` they are no longer useful in current cases). It is fairly common pattern to scan a directory, look at the `stat` results (ex. is this file changed), and then open/read the file. In this PR I didn't update open's API to allow passing in a stat result to use, but that could be beneficial for some cases (ex. `importlib`). With this change on my Linux machine reading a small plain text file is down to 6 system calls. ```python openat(AT_FDCWD, "read_one.py", O_RDONLY|O_CLOEXEC) = 3 fstat(3, {st_mode=S_IFREG|0644, st_size=87, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "from pathlib import Path\n\npath ="..., 88) = 87 read(3, "", 1) = 0 close(3) = 0 ``` --- Lib/_pyio.py | 48 +++++++------ ...-07-10-01-55-33.gh-issue-120754.KSWAmz.rst | 2 + Modules/_io/fileio.c | 67 ++++++++++++++----- 3 files changed, 78 insertions(+), 39 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-07-10-01-55-33.gh-issue-120754.KSWAmz.rst diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 75b5ad1b1a47d2..1bb6292f113a08 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -64,6 +64,15 @@ def text_encoding(encoding, stacklevel=2): return encoding +def _shortcut_isatty(raw): + """Regular files are never TTYs. In cases where we know we have a stat + result, use it to shortcut / skip a call to isatty.""" + if raw._stat_atopen and stat.S_ISREG(raw._stat_atopen.st_mode): + return False + + return raw.isatty() + + # Wrapper for builtins.open # # Trick so that open() won't become a bound method when stored @@ -238,18 +247,12 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, result = raw try: line_buffering = False - if buffering == 1 or buffering < 0 and raw.isatty(): + + if buffering == 1 or buffering < 0 and _shortcut_isatty(raw): buffering = -1 line_buffering = True if buffering < 0: - buffering = DEFAULT_BUFFER_SIZE - try: - bs = os.fstat(raw.fileno()).st_blksize - except (OSError, AttributeError): - pass - else: - if bs > 1: - buffering = bs + buffering = raw._blksize if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: @@ -1565,19 +1568,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None): os.set_inheritable(fd, False) self._closefd = closefd - fdfstat = os.fstat(fd) + self._stat_atopen = os.fstat(fd) try: - if stat.S_ISDIR(fdfstat.st_mode): + if stat.S_ISDIR(self._stat_atopen.st_mode): raise IsADirectoryError(errno.EISDIR, os.strerror(errno.EISDIR), file) except AttributeError: # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR # don't exist. pass - self._blksize = getattr(fdfstat, 'st_blksize', 0) - if self._blksize <= 1: - self._blksize = DEFAULT_BUFFER_SIZE - self._estimated_size = fdfstat.st_size if _setmode: # don't translate newlines (\r\n <=> \n) @@ -1623,6 +1622,13 @@ def __repr__(self): return ('<%s name=%r mode=%r closefd=%r>' % (class_name, name, self.mode, self._closefd)) + @property + def _blksize(self): + if self._stat_atopen: + return getattr(self._stat_atopen, "st_blksize", DEFAULT_BUFFER_SIZE) + + return DEFAULT_BUFFER_SIZE + def _checkReadable(self): if not self._readable: raise UnsupportedOperation('File not open for reading') @@ -1655,16 +1661,16 @@ def readall(self): """ self._checkClosed() self._checkReadable() - if self._estimated_size <= 0: + if not self._stat_atopen or self._stat_atopen.st_size <= 0: bufsize = DEFAULT_BUFFER_SIZE else: - bufsize = self._estimated_size + 1 + bufsize = self._stat_atopen.st_size + 1 - if self._estimated_size > 65536: + if self._stat_atopen.st_size > 65536: try: pos = os.lseek(self._fd, 0, SEEK_CUR) - if self._estimated_size >= pos: - bufsize = self._estimated_size - pos + 1 + if self._stat_atopen.st_size >= pos: + bufsize = self._stat_atopen.st_size - pos + 1 except OSError: pass @@ -1742,7 +1748,7 @@ def truncate(self, size=None): if size is None: size = self.tell() os.ftruncate(self._fd, size) - self._estimated_size = size + self._stat_atopen = None return size def close(self): diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-07-10-01-55-33.gh-issue-120754.KSWAmz.rst b/Misc/NEWS.d/next/Core and Builtins/2024-07-10-01-55-33.gh-issue-120754.KSWAmz.rst new file mode 100644 index 00000000000000..d8a13db3849eff --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-07-10-01-55-33.gh-issue-120754.KSWAmz.rst @@ -0,0 +1,2 @@ +Skip the ``isatty`` system call on just-opened regular files. This provides a +slight performance improvement when reading whole files. diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 5d9d87d6118a75..3612a93ec7a78c 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -74,8 +74,7 @@ typedef struct { signed int seekable : 2; /* -1 means unknown */ unsigned int closefd : 1; char finalizing; - unsigned int blksize; - Py_off_t estimated_size; + struct _Py_stat_struct *stat_atopen; PyObject *weakreflist; PyObject *dict; } fileio; @@ -199,8 +198,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds) self->writable = 0; self->appending = 0; self->seekable = -1; - self->blksize = 0; - self->estimated_size = -1; + self->stat_atopen = NULL; self->closefd = 1; self->weakreflist = NULL; } @@ -256,7 +254,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #elif !defined(MS_WINDOWS) int *atomic_flag_works = NULL; #endif - struct _Py_stat_struct fdfstat; int fstat_result; int async_err = 0; @@ -454,9 +451,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif } - self->blksize = DEFAULT_BUFFER_SIZE; + self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1); + if (self->stat_atopen == NULL) { + PyErr_Format(PyExc_MemoryError, "Unable to allocate space for stat result"); + goto error; + } Py_BEGIN_ALLOW_THREADS - fstat_result = _Py_fstat_noraise(self->fd, &fdfstat); + fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen); Py_END_ALLOW_THREADS if (fstat_result < 0) { /* Tolerate fstat() errors other than EBADF. See Issue #25717, where @@ -471,25 +472,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, #endif goto error; } + + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; } else { #if defined(S_ISDIR) && defined(EISDIR) /* On Unix, open will succeed for directories. In Python, there should be no file objects referring to directories, so we need a check. */ - if (S_ISDIR(fdfstat.st_mode)) { + if (S_ISDIR(self->stat_atopen->st_mode)) { errno = EISDIR; PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj); goto error; } #endif /* defined(S_ISDIR) */ -#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE - if (fdfstat.st_blksize > 1) - self->blksize = fdfstat.st_blksize; -#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ - if (fdfstat.st_size < PY_SSIZE_T_MAX) { - self->estimated_size = (Py_off_t)fdfstat.st_size; - } } #if defined(MS_WINDOWS) || defined(__CYGWIN__) @@ -521,6 +518,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode, internal_close(self); _PyErr_ChainExceptions1(exc); } + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } done: #ifdef MS_WINDOWS @@ -553,6 +554,10 @@ fileio_dealloc(fileio *self) if (_PyIOBase_finalize((PyObject *) self) < 0) return; _PyObject_GC_UNTRACK(self); + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } if (self->weakreflist != NULL) PyObject_ClearWeakRefs((PyObject *) self); (void)fileio_clear(self); @@ -725,7 +730,13 @@ _io_FileIO_readall_impl(fileio *self) return err_closed(); } - end = self->estimated_size; + if (self->stat_atopen != NULL) { + end = (Py_off_t)Py_MIN(self->stat_atopen->st_size, PY_SSIZE_T_MAX); + } + else { + end = -1; + } + if (end <= 0) { /* Use a default size and resize as needed. */ bufsize = SMALLCHUNK; @@ -1098,7 +1109,10 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj) estimate, that it is much larger than the actual size can result in a significant over allocation and sometimes a MemoryError / running out of memory. */ - self->estimated_size = pos; + if (self->stat_atopen != NULL) { + PyMem_Free(self->stat_atopen); + self->stat_atopen = NULL; + } return posobj; } @@ -1179,6 +1193,11 @@ _io_FileIO_isatty_impl(fileio *self) if (self->fd < 0) return err_closed(); + + if (self->stat_atopen != NULL && S_ISREG(self->stat_atopen->st_mode)) { + return Py_False; + } + Py_BEGIN_ALLOW_THREADS _Py_BEGIN_SUPPRESS_IPH res = isatty(self->fd); @@ -1229,16 +1248,28 @@ get_mode(fileio *self, void *closure) return PyUnicode_FromString(mode_string(self)); } +static PyObject * +get_blksize(fileio *self, void *closure) +{ +#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE + if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) { + return PyLong_FromLong(self->stat_atopen->st_blksize); + } +#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */ + return PyLong_FromLong(DEFAULT_BUFFER_SIZE); +} + + static PyGetSetDef fileio_getsetlist[] = { {"closed", (getter)get_closed, NULL, "True if the file is closed"}, {"closefd", (getter)get_closefd, NULL, "True if the file descriptor will be closed by close()."}, {"mode", (getter)get_mode, NULL, "String giving the file mode"}, + {"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"}, {NULL}, }; static PyMemberDef fileio_members[] = { - {"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0}, {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0}, {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY}, {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY}, From a51e8eeaaa891dc7fbb80a12b34e7f86cbbefb65 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 10 Jul 2024 13:26:07 -0700 Subject: [PATCH 2/4] WASI sets st_blksize to 0, update _pyio to handle. _io already handled --- Lib/_pyio.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 1bb6292f113a08..90a20755139c03 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1625,9 +1625,10 @@ def __repr__(self): @property def _blksize(self): if self._stat_atopen: - return getattr(self._stat_atopen, "st_blksize", DEFAULT_BUFFER_SIZE) + res = getattr(self._stat_atopen, "st_blksize", 0) - return DEFAULT_BUFFER_SIZE + # WASI sets blksize to 0 + return res if res > 0 else DEFAULT_BUFFER_SIZE def _checkReadable(self): if not self._readable: From 991fb010c573800a39e452418b2ca050343dae27 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 10 Jul 2024 16:03:46 -0700 Subject: [PATCH 3/4] Read must be smaller than _PY_READ_MAX --- Modules/_io/fileio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 3612a93ec7a78c..0cc15f81163b3d 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -731,7 +731,7 @@ _io_FileIO_readall_impl(fileio *self) } if (self->stat_atopen != NULL) { - end = (Py_off_t)Py_MIN(self->stat_atopen->st_size, PY_SSIZE_T_MAX); + end = (Py_off_t)Py_MIN(self->stat_atopen->st_size, _PY_READ_MAX); } else { end = -1; From a487c6bb7d3e8237f23fa7190fdd056f11ec8456 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 10 Jul 2024 23:39:06 -0700 Subject: [PATCH 4/4] Zipimport test relies on having a size > PY_READ_MAX meaning auto-expanding buffer --- Modules/_io/fileio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 0cc15f81163b3d..a141662a496133 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -730,8 +730,8 @@ _io_FileIO_readall_impl(fileio *self) return err_closed(); } - if (self->stat_atopen != NULL) { - end = (Py_off_t)Py_MIN(self->stat_atopen->st_size, _PY_READ_MAX); + if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) { + end = (Py_off_t)self->stat_atopen->st_size; } else { end = -1; @@ -775,7 +775,6 @@ _io_FileIO_readall_impl(fileio *self) } } - result = PyBytes_FromStringAndSize(NULL, bufsize); if (result == NULL) return NULL;