Skip to content

Commit 8b6c7c7

Browse files
authored
gh-120754: Refactor I/O modules to stash whole stat result rather than individual members (#123412)
Multiple places in the I/O stack optimize common cases by using the information from stat. Currently individual members are extracted from the stat and stored into the fileio struct. Refactor the code to store the whole stat struct instead. Parallels the changes to _io. The `stat` Python object doesn't allow changing members, so rather than modifying estimated_size, just clear the value.
1 parent 96f619f commit 8b6c7c7

File tree

2 files changed

+81
-46
lines changed

2 files changed

+81
-46
lines changed

Lib/_pyio.py

+24-20
Original file line numberDiff line numberDiff line change
@@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
242242
buffering = -1
243243
line_buffering = True
244244
if buffering < 0:
245-
buffering = DEFAULT_BUFFER_SIZE
246-
try:
247-
bs = os.fstat(raw.fileno()).st_blksize
248-
except (OSError, AttributeError):
249-
pass
250-
else:
251-
if bs > 1:
252-
buffering = bs
245+
buffering = raw._blksize
253246
if buffering < 0:
254247
raise ValueError("invalid buffering size")
255248
if buffering == 0:
@@ -1565,19 +1558,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None):
15651558
os.set_inheritable(fd, False)
15661559

15671560
self._closefd = closefd
1568-
fdfstat = os.fstat(fd)
1561+
self._stat_atopen = os.fstat(fd)
15691562
try:
1570-
if stat.S_ISDIR(fdfstat.st_mode):
1563+
if stat.S_ISDIR(self._stat_atopen.st_mode):
15711564
raise IsADirectoryError(errno.EISDIR,
15721565
os.strerror(errno.EISDIR), file)
15731566
except AttributeError:
15741567
# Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR
15751568
# don't exist.
15761569
pass
1577-
self._blksize = getattr(fdfstat, 'st_blksize', 0)
1578-
if self._blksize <= 1:
1579-
self._blksize = DEFAULT_BUFFER_SIZE
1580-
self._estimated_size = fdfstat.st_size
15811570

15821571
if _setmode:
15831572
# don't translate newlines (\r\n <=> \n)
@@ -1623,6 +1612,17 @@ def __repr__(self):
16231612
return ('<%s name=%r mode=%r closefd=%r>' %
16241613
(class_name, name, self.mode, self._closefd))
16251614

1615+
@property
1616+
def _blksize(self):
1617+
if self._stat_atopen is None:
1618+
return DEFAULT_BUFFER_SIZE
1619+
1620+
blksize = getattr(self._stat_atopen, "st_blksize", 0)
1621+
# WASI sets blsize to 0
1622+
if not blksize:
1623+
return DEFAULT_BUFFER_SIZE
1624+
return blksize
1625+
16261626
def _checkReadable(self):
16271627
if not self._readable:
16281628
raise UnsupportedOperation('File not open for reading')
@@ -1655,16 +1655,20 @@ def readall(self):
16551655
"""
16561656
self._checkClosed()
16571657
self._checkReadable()
1658-
if self._estimated_size <= 0:
1658+
if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
16591659
bufsize = DEFAULT_BUFFER_SIZE
16601660
else:
1661-
bufsize = self._estimated_size + 1
1661+
# In order to detect end of file, need a read() of at least 1
1662+
# byte which returns size 0. Oversize the buffer by 1 byte so the
1663+
# I/O can be completed with two read() calls (one for all data, one
1664+
# for EOF) without needing to resize the buffer.
1665+
bufsize = self._stat_atopen.st_size + 1
16621666

1663-
if self._estimated_size > 65536:
1667+
if self._stat_atopen.st_size > 65536:
16641668
try:
16651669
pos = os.lseek(self._fd, 0, SEEK_CUR)
1666-
if self._estimated_size >= pos:
1667-
bufsize = self._estimated_size - pos + 1
1670+
if self._stat_atopen.st_size >= pos:
1671+
bufsize = self._stat_atopen.st_size - pos + 1
16681672
except OSError:
16691673
pass
16701674

@@ -1742,7 +1746,7 @@ def truncate(self, size=None):
17421746
if size is None:
17431747
size = self.tell()
17441748
os.ftruncate(self._fd, size)
1745-
self._estimated_size = size
1749+
self._stat_atopen = None
17461750
return size
17471751

17481752
def close(self):

Modules/_io/fileio.c

+57-26
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,13 @@ typedef struct {
7474
signed int seekable : 2; /* -1 means unknown */
7575
unsigned int closefd : 1;
7676
char finalizing;
77-
unsigned int blksize;
78-
Py_off_t estimated_size;
77+
/* Stat result which was grabbed at file open, useful for optimizing common
78+
File I/O patterns to be more efficient. This is only guidance / an
79+
estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU)
80+
issues / bugs. Both the underlying file descriptor and file may be
81+
modified outside of the fileio object / Python (ex. gh-90102, GH-121941,
82+
gh-109523). */
83+
struct _Py_stat_struct *stat_atopen;
7984
PyObject *weakreflist;
8085
PyObject *dict;
8186
} fileio;
@@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
199204
self->writable = 0;
200205
self->appending = 0;
201206
self->seekable = -1;
202-
self->blksize = 0;
203-
self->estimated_size = -1;
207+
self->stat_atopen = NULL;
204208
self->closefd = 1;
205209
self->weakreflist = NULL;
206210
}
@@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
256260
#elif !defined(MS_WINDOWS)
257261
int *atomic_flag_works = NULL;
258262
#endif
259-
struct _Py_stat_struct fdfstat;
260263
int fstat_result;
261264
int async_err = 0;
262265

@@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
454457
#endif
455458
}
456459

457-
self->blksize = DEFAULT_BUFFER_SIZE;
460+
self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1);
461+
if (self->stat_atopen == NULL) {
462+
PyErr_NoMemory();
463+
goto error;
464+
}
458465
Py_BEGIN_ALLOW_THREADS
459-
fstat_result = _Py_fstat_noraise(self->fd, &fdfstat);
466+
fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen);
460467
Py_END_ALLOW_THREADS
461468
if (fstat_result < 0) {
462469
/* Tolerate fstat() errors other than EBADF. See Issue #25717, where
@@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
471478
#endif
472479
goto error;
473480
}
481+
482+
PyMem_Free(self->stat_atopen);
483+
self->stat_atopen = NULL;
474484
}
475485
else {
476486
#if defined(S_ISDIR) && defined(EISDIR)
477487
/* On Unix, open will succeed for directories.
478488
In Python, there should be no file objects referring to
479489
directories, so we need a check. */
480-
if (S_ISDIR(fdfstat.st_mode)) {
490+
if (S_ISDIR(self->stat_atopen->st_mode)) {
481491
errno = EISDIR;
482492
PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj);
483493
goto error;
484494
}
485495
#endif /* defined(S_ISDIR) */
486-
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
487-
if (fdfstat.st_blksize > 1)
488-
self->blksize = fdfstat.st_blksize;
489-
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
490-
if (fdfstat.st_size < PY_SSIZE_T_MAX) {
491-
self->estimated_size = (Py_off_t)fdfstat.st_size;
492-
}
493496
}
494497

495498
#if defined(MS_WINDOWS) || defined(__CYGWIN__)
@@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
521524
internal_close(self);
522525
_PyErr_ChainExceptions1(exc);
523526
}
527+
if (self->stat_atopen != NULL) {
528+
PyMem_Free(self->stat_atopen);
529+
self->stat_atopen = NULL;
530+
}
524531

525532
done:
526533
#ifdef MS_WINDOWS
@@ -553,6 +560,10 @@ fileio_dealloc(fileio *self)
553560
if (_PyIOBase_finalize((PyObject *) self) < 0)
554561
return;
555562
_PyObject_GC_UNTRACK(self);
563+
if (self->stat_atopen != NULL) {
564+
PyMem_Free(self->stat_atopen);
565+
self->stat_atopen = NULL;
566+
}
556567
if (self->weakreflist != NULL)
557568
PyObject_ClearWeakRefs((PyObject *) self);
558569
(void)fileio_clear(self);
@@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self)
725736
return err_closed();
726737
}
727738

728-
end = self->estimated_size;
739+
if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) {
740+
end = (Py_off_t)self->stat_atopen->st_size;
741+
}
742+
else {
743+
end = -1;
744+
}
729745
if (end <= 0) {
730746
/* Use a default size and resize as needed. */
731747
bufsize = SMALLCHUNK;
732748
}
733749
else {
734-
/* This is probably a real file, so we try to allocate a
735-
buffer one byte larger than the rest of the file. If the
736-
calculation is right then we should get EOF without having
737-
to enlarge the buffer. */
750+
/* This is probably a real file. */
738751
if (end > _PY_READ_MAX - 1) {
739752
bufsize = _PY_READ_MAX;
740753
}
741754
else {
755+
/* In order to detect end of file, need a read() of at
756+
least 1 byte which returns size 0. Oversize the buffer
757+
by 1 byte so the I/O can be completed with two read()
758+
calls (one for all data, one for EOF) without needing
759+
to resize the buffer. */
742760
bufsize = (size_t)end + 1;
743761
}
744762

@@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj)
10941112
return NULL;
10951113
}
10961114

1097-
/* Sometimes a large file is truncated. While estimated_size is used as a
1098-
estimate, that it is much larger than the actual size can result in a
1099-
significant over allocation and sometimes a MemoryError / running out of
1100-
memory. */
1101-
self->estimated_size = pos;
1115+
/* Since the file was truncated, its size at open is no longer accurate
1116+
as an estimate. Clear out the stat result, and rely on dynamic resize
1117+
code if a readall is requested. */
1118+
if (self->stat_atopen != NULL) {
1119+
PyMem_Free(self->stat_atopen);
1120+
self->stat_atopen = NULL;
1121+
}
11021122

11031123
return posobj;
11041124
}
@@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure)
12291249
return PyUnicode_FromString(mode_string(self));
12301250
}
12311251

1252+
static PyObject *
1253+
get_blksize(fileio *self, void *closure)
1254+
{
1255+
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
1256+
if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) {
1257+
return PyLong_FromLong(self->stat_atopen->st_blksize);
1258+
}
1259+
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
1260+
return PyLong_FromLong(DEFAULT_BUFFER_SIZE);
1261+
}
1262+
12321263
static PyGetSetDef fileio_getsetlist[] = {
12331264
{"closed", (getter)get_closed, NULL, "True if the file is closed"},
12341265
{"closefd", (getter)get_closefd, NULL,
12351266
"True if the file descriptor will be closed by close()."},
12361267
{"mode", (getter)get_mode, NULL, "String giving the file mode"},
1268+
{"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"},
12371269
{NULL},
12381270
};
12391271

12401272
static PyMemberDef fileio_members[] = {
1241-
{"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0},
12421273
{"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0},
12431274
{"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY},
12441275
{"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY},

0 commit comments

Comments
 (0)