diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index 6ce42b4c09502f..96f42e3db16585 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -179,6 +179,7 @@ typedef struct PyConfig { int use_frozen_modules; int safe_path; int int_max_str_digits; + wchar_t *gc_stats_file; /* --- Path configuration inputs ------------ */ int pathconfig_warnings; diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index b3abe2030a03da..68efec829fb320 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -187,6 +187,7 @@ struct _gc_runtime_state { collections, and are awaiting to undergo a full collection for the first time. */ Py_ssize_t long_lived_pending; + int advanced_stats; }; diff --git a/Include/pystats.h b/Include/pystats.h index 25ed4bddc7240c..93de7a6435e4e3 100644 --- a/Include/pystats.h +++ b/Include/pystats.h @@ -80,7 +80,6 @@ typedef struct _stats { ObjectStats object_stats; } PyStats; - PyAPI_DATA(PyStats) _py_stats_struct; PyAPI_DATA(PyStats *) _py_stats; diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index 6630faa6f4471d..dc0ca52ac37318 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -31,6 +31,7 @@ #include "pycore_pyerrors.h" #include "pycore_pystate.h" // _PyThreadState_GET() #include "pydtrace.h" +#include "pystats.h" typedef struct _gc_runtime_state GCState; @@ -39,6 +40,17 @@ module gc [clinic start generated code]*/ /*[clinic end generated code: output=da39a3ee5e6b4b0d input=b5c9690ecc842d79]*/ +typedef struct _gc_stats { + size_t n_collections; + PyObject* generation_number; + PyObject* total_objects; + PyObject* uncollectable; + PyObject* collected_cycles; + PyObject* collection_time; +} PyGCStats; + +PyGCStats _pygc_stats_struct = { 0 }; + #ifdef Py_DEBUG # define GC_DEBUG @@ -135,6 +147,32 @@ get_gc_state(void) return &interp->gc; } +static inline int +is_main_interpreter(void) +{ + return (PyInterpreterState_Get() == PyInterpreterState_Main()); +} + +static int +_PyInitGCStats() { + if (!is_main_interpreter()) { + return 0; + } +#define INIT_FIELD(field) {\ + _pygc_stats_struct.field = PyList_New(0);\ + if (_pygc_stats_struct.field== NULL) {\ + return -1; \ + }\ + _PyObject_GC_UNTRACK(_pygc_stats_struct.field); } + + INIT_FIELD(generation_number); + INIT_FIELD(total_objects); + INIT_FIELD(uncollectable); + INIT_FIELD(collected_cycles); + INIT_FIELD(collection_time); +#undef INIT_FIELD + return 0; +} void _PyGC_InitState(GCState *gcstate) @@ -151,8 +189,15 @@ _PyGC_InitState(GCState *gcstate) }; gcstate->generation0 = GEN_HEAD(gcstate, 0); INIT_HEAD(gcstate->permanent_generation); - #undef INIT_HEAD + + char *s = Py_GETENV("PYTHONGCTHRESHOLD"); + if (s) { + long n = atol(s); + if (n > 0) { + gcstate->generations[0].threshold = n; + } + } } @@ -171,6 +216,13 @@ _PyGC_Init(PyInterpreterState *interp) return _PyStatus_NO_MEMORY(); } + if (_Py_GetConfig()->gc_stats_file) { + if(_PyInitGCStats()) { + Py_FatalError("Could not initialize GC stats"); + } + gcstate->advanced_stats = 1; + } + return _PyStatus_OK(); } @@ -1192,6 +1244,13 @@ gc_collect_main(PyThreadState *tstate, int generation, PyGC_Head *gc; _PyTime_t t1 = 0; /* initialize to prevent a compiler warning */ GCState *gcstate = &tstate->interp->gc; + _PyTime_t gc_t1 = 0; + _PyTime_t gc_t2 = 0; + Py_ssize_t t = 0; + + if (gcstate->advanced_stats) { + gc_t1 = _PyTime_GetPerfCounter(); + } // gc_collect_main() must not be called before _PyGC_Init // or after _PyGC_Fini() @@ -1226,6 +1285,10 @@ gc_collect_main(PyThreadState *tstate, int generation, old = young; validate_list(old, collecting_clear_unreachable_clear); + if (gcstate->advanced_stats) { + t = gc_list_size(young); + } + deduce_unreachable(young, &unreachable); untrack_tuples(young); @@ -1324,6 +1387,31 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyErr_WriteUnraisableMsg("in garbage collection", NULL); } } + if (gcstate->advanced_stats) { + gc_t2 = _PyTime_GetPerfCounter(); +#define ADD_ELEMENT(field, elem, converter_fn) \ + { \ + PyObject* item = converter_fn(elem); \ + if (!item) { \ + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); \ + } \ + if (item && PyList_Append(_pygc_stats_struct.field, item) >= 0) { \ + _PyErr_WriteUnraisableMsg("in garbage collection", NULL); \ + } \ + Py_XDECREF(item); \ + } \ + + if (_pygc_stats_struct.generation_number) { + _pygc_stats_struct.n_collections++; + ADD_ELEMENT(generation_number, generation, PyLong_FromLong); + ADD_ELEMENT(total_objects, t, PyLong_FromSsize_t); + ADD_ELEMENT(uncollectable, n, PyLong_FromSsize_t); + ADD_ELEMENT(collected_cycles, m, PyLong_FromSsize_t); + double d = _PyTime_AsSecondsDouble(gc_t2 - gc_t1); + ADD_ELEMENT(collection_time, d, PyFloat_FromDouble); + } +#undef ADD_ELEMENT + } /* Update stats */ if (n_collected) { @@ -2168,11 +2256,66 @@ gc_fini_untrack(PyGC_Head *list) } } +static void +print_stats(FILE *out) { +#define WRITE_ITEM(collection, index, type, converter_fn, str_format) { \ + PyObject* item = PyList_GET_ITEM(_pygc_stats_struct.collection, index); \ + if (!item) { \ + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); \ + } \ + type num = converter_fn(item); \ + if (!num && PyErr_Occurred()) { \ + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); \ + } \ + fprintf(out, str_format, num); } \ + + fprintf(out, "collection,generation_number,total_objects,uncollectable,collected_cycles,collection_time\n"); + for (size_t i = 0; i < _pygc_stats_struct.n_collections; i++) { + fprintf(out, "%zd", i); + WRITE_ITEM(generation_number, i, long, PyLong_AsLong, "%zd,"); + WRITE_ITEM(total_objects, i, long, PyLong_AsLong, "%zd,"); + WRITE_ITEM(uncollectable, i, long, PyLong_AsLong, "%zd,"); + WRITE_ITEM(collected_cycles, i, long, PyLong_AsLong, "%zd,"); + WRITE_ITEM(collection_time, i, double, PyFloat_AsDouble, "%f"); + fprintf(out, "\n"); + } +} + +void +_Py_PrintGCStats(int to_file) +{ + FILE *out = stderr; + wchar_t *filename = _Py_GetConfig()->gc_stats_file; + if (filename && wcslen(filename) != 0) { + // Open a file from the gc_stats_file (wchar) + out = _Py_wfopen(filename, L"wb"); + if (!out) { + _PyErr_WriteUnraisableMsg(" when writing gc stats", NULL); + return; + } + } + else { + fprintf(out, "GC stats:\n"); + } + print_stats(out); + if (out != stderr) { + fclose(out); + } + + Py_CLEAR(_pygc_stats_struct.generation_number); + Py_CLEAR(_pygc_stats_struct.total_objects); + Py_CLEAR(_pygc_stats_struct.uncollectable); + Py_CLEAR(_pygc_stats_struct.collected_cycles); + Py_CLEAR(_pygc_stats_struct.collection_time); +} void _PyGC_Fini(PyInterpreterState *interp) { GCState *gcstate = &interp->gc; + if (is_main_interpreter() && gcstate->advanced_stats) { + _Py_PrintGCStats(1); + } Py_CLEAR(gcstate->garbage); Py_CLEAR(gcstate->callbacks); diff --git a/Python/initconfig.c b/Python/initconfig.c index d7b2dc4a297425..83075aee52b0ee 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -129,7 +129,11 @@ The following implementation-specific options are available:\n\ \n\ -X int_max_str_digits=number: limit the size of int<->str conversions.\n\ This helps avoid denial of service attacks when parsing untrusted data.\n\ - The default is sys.int_info.default_max_str_digits. 0 disables." + The default is sys.int_info.default_max_str_digits. 0 disables.\n\ +\n\ +-X gc_stats_file=filename: activates advance statistics for the Python garbage collector\n\ + and writes the results to the specified file. If no file is provided (by writting\n\ + -X gc_stats_file), the results are written to stderr. The default value is \"off\"." #ifdef Py_STATS "\n\ @@ -176,6 +180,8 @@ static const char usage_envvars[] = " and end column offset) to every instruction in code objects. This is useful \n" " when smaller code objects and pyc files are desired as well as suppressing the \n" " extra visual location indicators when the interpreter displays tracebacks.\n" +"PYTHONGCSTATSFILE: If this variable is set, it enables advance statistics for the \n" +" cycle garbage collector and writes them in file specified by the environment variable.\n" "These variables have equivalent command-line parameters (see --help for details):\n" "PYTHONDEBUG : enable parser debug mode (-d)\n" "PYTHONDONTWRITEBYTECODE : don't write .pyc files (-B)\n" @@ -703,6 +709,7 @@ config_check_consistency(const PyConfig *config) assert(config->_is_python_build >= 0); assert(config->safe_path >= 0); assert(config->int_max_str_digits >= 0); + assert(config->gc_stats_file != NULL); // config->use_frozen_modules is initialized later // by _PyConfig_InitImportConfig(). return 1; @@ -799,6 +806,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->int_max_str_digits = -1; config->_is_python_build = 0; config->code_debug_ranges = 1; + config->gc_stats_file = NULL; } @@ -1026,6 +1034,7 @@ _PyConfig_Copy(PyConfig *config, const PyConfig *config2) COPY_WSTRLIST(orig_argv); COPY_ATTR(_is_python_build); COPY_ATTR(int_max_str_digits); + COPY_WSTR_ATTR(gc_stats_file); #undef COPY_ATTR #undef COPY_WSTR_ATTR @@ -1133,6 +1142,7 @@ _PyConfig_AsDict(const PyConfig *config) SET_ITEM_INT(safe_path); SET_ITEM_INT(_is_python_build); SET_ITEM_INT(int_max_str_digits); + SET_ITEM_WSTR(gc_stats_file); return dict; @@ -1426,6 +1436,7 @@ _PyConfig_FromDict(PyConfig *config, PyObject *dict) GET_UINT(safe_path); GET_UINT(_is_python_build); GET_INT(int_max_str_digits); + GET_WSTR_OPT(gc_stats_file); #undef CHECK_VALUE #undef GET_UINT @@ -1861,6 +1872,31 @@ config_init_pycache_prefix(PyConfig *config) "PYTHONPYCACHEPREFIX"); } +static PyStatus +config_init_gc_stats_file(PyConfig *config) +{ + assert(config->gc_stats_file == NULL); + + const wchar_t *xoption = config_get_xoption(config, L"gc_stats_file"); + if (xoption) { + const wchar_t *sep = wcschr(xoption, L'='); + if (sep && wcslen(sep) > 1) { + config->gc_stats_file = _PyMem_RawWcsdup(sep + 1); + if (config->gc_stats_file == NULL) { + return _PyStatus_NO_MEMORY(); + } + } + else { + config->gc_stats_file = L""; + } + return _PyStatus_OK(); + } + + return CONFIG_GET_ENV_DUP(config, &config->gc_stats_file, + L"PYTHONGCSTATSFILE", + "PYTHONGCSTATSFILE"); +} + static PyStatus config_read_complex_options(PyConfig *config) @@ -1910,6 +1946,14 @@ config_read_complex_options(PyConfig *config) return status; } } + + if (config->gc_stats_file == NULL) { + status = config_init_gc_stats_file(config); + if (_PyStatus_EXCEPTION(status)) { + return status; + } + } + return _PyStatus_OK(); } diff --git a/Tools/scripts/generate_gc_stats_plots.py b/Tools/scripts/generate_gc_stats_plots.py new file mode 100644 index 00000000000000..801e9f794f89fa --- /dev/null +++ b/Tools/scripts/generate_gc_stats_plots.py @@ -0,0 +1,205 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import argparse + + +def get_obj_percentage_data(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collected_cycles"] / vals["total_objects"].values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gen_time_data(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collection_time"].values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gen_time_data_per_obj(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = vals["collection_time"] / (1 + vals["collected_cycles"]).values + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gen_freq_per_us(df): + gen_data = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + item = ( + 1.0 + / (vals["collection_time"] / (1 + vals["collected_cycles"])).values + / 1.0e3 + ) + if item.size == 0: + item = np.array([0]) + gen_data.append(item) + return gen_data + + +def get_gc_summary(df): + gen_totals = [] + for generation in range(3): + vals = df[df["generation_number"] == generation] + obj_collected = vals["collected_cycles"].sum() + total_time = vals["collection_time"].sum() + total_objs = vals["total_objects"].sum() + gen_totals.append((obj_collected, total_time, total_objs)) + total_gc_time = df["collection_time"].sum() + return gen_totals, total_gc_time + + +def gen_plot(df, output_filename): + def violinplot_with_custom_formatting( + ax, data, yticks, ylim, xlabel, title, formatter + ): + violin = ax.violinplot( + data, + vert=False, + showmeans=True, + showmedians=False, + widths=1, + quantiles=[[0.1, 0.9]] * len(data), + ) + violin["cquantiles"].set_linestyle(":") + ax.set_yticks(np.arange(len(yticks)) + 1, yticks) + ax.set_ylim(ylim) + ax.tick_params(axis="x", bottom=True, top=True, labelbottom=True, labeltop=True) + ax.xaxis.set_major_formatter(formatter) + ax.grid() + ax.set_title(title) + + def barplot_with_custom_formatting(ax, data, title, ytitle, formatter): + ax.bar(names, data) + ax.set_ylabel(ytitle) + ax.set_title(title) + ax.yaxis.set_major_formatter(formatter) + ax.grid() + + # names = ["First generation", "Second generation", "Third generation"] + names = ["Gen-0", "Gen-1", "Gen-2"] + # fig, (ax1, ax2, ax3, ax4) = plt.subplots( + # 5, 1, figsize=(8, (2 + len(names) * 0.3) * 4), layout="constrained" + # ) + fig = plt.figure(figsize=(10, 10), layout="constrained") + + ax1_0 = plt.subplot(7, 3, 1) + ax1_1 = plt.subplot(7, 3, 2) + ax1_2 = plt.subplot(7, 3, 3) + ax2_2 = plt.subplot(7, 1, 3) + ax2_3 = plt.subplot(7, 1, 4) + ax2_4 = plt.subplot(7, 1, 5) + ax2_5 = plt.subplot(7, 1, 6) + + figsize = (8, (2 + len(names) * 0.3) * 4) + + running_time_data = [x.sum() for x in get_gen_time_data(df)] + formatter = lambda val, pos: f"{val*1000:.0f}" + barplot_with_custom_formatting( + ax1_0, running_time_data, "Total running time", "[ms]", formatter + ) + + object_number_data = [ + np.sum(df[df["generation_number"] == i]["collected_cycles"] / 1000.0) + for i in range(3) + ] + formatter = lambda val, pos: f"{val:.0f}" + barplot_with_custom_formatting( + ax1_1, object_number_data, "Total object collected", "[x1000 obj]", formatter + ) + + object_per_time = np.divide( + np.array(object_number_data), np.array(running_time_data) + ) + formatter = lambda val, pos: f"{val:.0f}" + barplot_with_custom_formatting( + ax1_2, object_number_data, "Collected per run time", "[obj/ms]", formatter + ) + + obj_percentage_data = get_obj_percentage_data(df) + formatter = lambda val, pos: f"{val*100:.0f}%" + violinplot_with_custom_formatting( + ax2_2, + obj_percentage_data, + names, + (0, len(names) + 1), + "efficiency", + "Generation efficiency", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{val*1e3:.0f}ms" + gen_time_data = get_gen_time_data(df) + violinplot_with_custom_formatting( + ax2_3, + gen_time_data, + names, + (0, len(names) + 1), + "time", + "Generation Time", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{val*1e3:.0f}ms" + gen_time_data_per_obj = get_gen_time_data_per_obj(df) + violinplot_with_custom_formatting( + ax2_4, + gen_time_data_per_obj, + names, + (0, len(names) + 1), + "time", + "Time per obj collected", + formatter=formatter, + ) + + formatter = lambda val, pos: f"{int(val)} obj/ms" + gen_freq_per_us = get_gen_freq_per_us(df) + violinplot_with_custom_formatting( + ax2_5, + gen_freq_per_us, + names, + (0, len(names) + 1), + "time", + "Objects collected per ms", + formatter=formatter, + ) + + plt.savefig(output_filename) + plt.close() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("file", help="The file to plot") + parser.add_argument("output_filename", help="The output filename") + args = parser.parse_args() + + df = pd.read_csv(args.file) + gen_plot(df, args.output_filename) + + gen_totals, total_gc_time = get_gc_summary(df) + for generation in range(3): + gen_data = gen_totals[generation] + print( + f"Generation {generation}: {gen_data[0]} objects collected, {gen_data[1]:.2f} s, {gen_data[2]} total objects" + ) + print("Total GC time: {:.2f} s".format(total_gc_time)) + + +if __name__ == "__main__": + main()