From d13b03732edd530ef0b6f621349c38c71dc2f7ac Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sat, 16 Apr 2022 11:40:29 +0800 Subject: [PATCH 1/5] 1. limit the maximum number of capturing groups to 1,073,741,823 These types are kept as Py_ssize_t: - PatternObject.groups - MatchObject.lastindex - MatchObject.groups sizeof(SRE(match_context)): - On 32-bit platforms: 36 bytes, no change. (msvc2022) - On 64-bit platforms: 72 bytes -> 64 bytes. (msvc2022/gcc9.4) --- .../Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst | 2 ++ Modules/_sre/sre.h | 12 ++++++------ Modules/_sre/sre_lib.h | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst diff --git a/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst b/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst new file mode 100644 index 00000000000000..ac4c52bd7058a1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-16-11-39-59.bpo-47256.1cygyd.rst @@ -0,0 +1,2 @@ +:mod:`re` module, limit the maximum capturing group to 1,073,741,823 in +64-bit build, this increases the depth of backtracking. 
diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index 129f5595269f5b..aff064d343ec4c 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -18,10 +18,10 @@ #define SRE_CODE Py_UCS4 #if SIZEOF_SIZE_T > 4 # define SRE_MAXREPEAT (~(SRE_CODE)0) -# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2) +# define SRE_MAXGROUPS ((SRE_CODE)INT32_MAX / 2) #else # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) -# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2) +# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_VOID_P / 2) #endif typedef struct { @@ -73,12 +73,12 @@ typedef struct { Py_ssize_t pos, endpos; int isbytes; int charsize; /* character size */ - /* registers */ - Py_ssize_t lastindex; - Py_ssize_t lastmark; - const void** mark; int match_all; int must_advance; + /* marks */ + int lastmark; + int lastindex; + const void** mark; /* dynamically allocated stuff */ char* data_stack; size_t data_stack_size; diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 3472e65b87ae6f..0481e9d8cc3964 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -512,8 +512,8 @@ typedef struct { const SRE_CHAR* ptr; const SRE_CODE* pattern; Py_ssize_t count; - Py_ssize_t lastmark; - Py_ssize_t lastindex; + int lastmark; + int lastindex; union { SRE_CODE chr; SRE_REPEAT* rep; From 8a38ca75ede568a00b2c7a12c24c66e0fa349ae0 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sat, 16 Apr 2022 11:49:57 +0800 Subject: [PATCH 2/5] 2. further reduce sizeof(SRE(match_context)) Change the type of `SRE(match_context).jump` from Py_ssize_t to int: - On 32-bit platforms: 36 bytes, no change. (msvc2022) - On 64-bit platforms: 64 bytes -> 56 bytes. (msvc2022/gcc9.4) Also make the order of the field assignments in the `DO_JUMPX` macro consistent with the field order of the `SRE(match_context)` struct. 
--- Modules/_sre/sre_lib.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 0481e9d8cc3964..069dfce609431d 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -488,10 +488,10 @@ do { \ ctx->pattern = pattern; \ ctx->ptr = ptr; \ DATA_ALLOC(SRE(match_context), nextctx); \ - nextctx->last_ctx_pos = ctx_pos; \ - nextctx->jump = jumpvalue; \ nextctx->pattern = nextpattern; \ nextctx->toplevel = toplevel_; \ + nextctx->jump = jumpvalue; \ + nextctx->last_ctx_pos = ctx_pos; \ pattern = nextpattern; \ ctx_pos = alloc_pos; \ ctx = nextctx; \ @@ -507,18 +507,18 @@ do { \ DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) typedef struct { - Py_ssize_t last_ctx_pos; - Py_ssize_t jump; - const SRE_CHAR* ptr; - const SRE_CODE* pattern; Py_ssize_t count; - int lastmark; - int lastindex; union { SRE_CODE chr; SRE_REPEAT* rep; } u; + int lastmark; + int lastindex; + const SRE_CODE* pattern; + const SRE_CHAR* ptr; int toplevel; + int jump; + Py_ssize_t last_ctx_pos; } SRE(match_context); #define MAYBE_CHECK_SIGNALS \ @@ -559,7 +559,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) const SRE_CHAR* end = (const SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t i, ret = 0; - Py_ssize_t jump; + int jump; unsigned int sigcount=0; SRE(match_context)* ctx; From 8cfcab9a76171a0063164ee15b0be3483be90bd2 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sat, 16 Apr 2022 11:56:10 +0800 Subject: [PATCH 3/5] 3. MARK_(PUSH|POP) macros compute marks size only once DATA_STACK_(PUSH|POP) macros use the size multiple times; now the computed value is propagated. 
--- Modules/_sre/sre_lib.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 069dfce609431d..03e55b59ddefc8 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -450,20 +450,23 @@ do { \ #define MARK_PUSH(lastmark) \ do if (lastmark >= 0) { \ - i = lastmark; /* ctx->lastmark may change if reallocated */ \ - DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ + size_t _marks_size = (lastmark+1) * sizeof(void*); \ + DATA_STACK_PUSH(state, state->mark, _marks_size); \ } while (0) #define MARK_POP(lastmark) \ do if (lastmark >= 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ + size_t _marks_size = (lastmark+1) * sizeof(void*); \ + DATA_STACK_POP(state, state->mark, _marks_size, 1); \ } while (0) #define MARK_POP_KEEP(lastmark) \ do if (lastmark >= 0) { \ - DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ + size_t _marks_size = (lastmark+1) * sizeof(void*); \ + DATA_STACK_POP(state, state->mark, _marks_size, 0); \ } while (0) #define MARK_POP_DISCARD(lastmark) \ do if (lastmark >= 0) { \ - DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ + size_t _marks_size = (lastmark+1) * sizeof(void*); \ + DATA_STACK_POP_DISCARD(state, _marks_size); \ } while (0) #define JUMP_NONE 0 From 11c9ad3e1a7c06164410380705fa218e22cb8bb2 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Sat, 16 Apr 2022 12:00:34 +0800 Subject: [PATCH 4/5] 4. 
remove variable i --- Modules/_sre/sre_lib.h | 45 ++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 03e55b59ddefc8..dcf18982c65966 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -561,7 +561,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) { const SRE_CHAR* end = (const SRE_CHAR *)state->end; Py_ssize_t alloc_pos, ctx_pos = -1; - Py_ssize_t i, ret = 0; + Py_ssize_t ret = 0; int jump; unsigned int sigcount=0; @@ -610,20 +610,22 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* */ TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0])); - i = pattern[0]; - if (i & 1) - state->lastindex = i/2 + 1; - if (i > state->lastmark) { - /* state->lastmark is the highest valid index in the - state->mark array. If it is increased by more than 1, - the intervening marks must be set to NULL to signal - that these marks have not been encountered. */ - Py_ssize_t j = state->lastmark + 1; - while (j < i) - state->mark[j++] = NULL; - state->lastmark = i; + { + int i = pattern[0]; + if (i & 1) + state->lastindex = i/2 + 1; + if (i > state->lastmark) { + /* state->lastmark is the highest valid index in the + state->mark array. If it is increased by more than 1, + the intervening marks must be set to NULL to signal + that these marks have not been encountered. 
*/ + int j = state->lastmark + 1; + while (j < i) + state->mark[j++] = NULL; + state->lastmark = i; + } + state->mark[i] = ptr; } - state->mark[i] = ptr; pattern++; DISPATCH; @@ -1376,9 +1378,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* match backreference */ TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0])); - i = pattern[0]; { - Py_ssize_t groupref = i+i; + int groupref = pattern[0] * 2; if (groupref >= state->lastmark) { RETURN_FAILURE; } else { @@ -1401,9 +1402,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* match backreference */ TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0])); - i = pattern[0]; { - Py_ssize_t groupref = i+i; + int groupref = pattern[0] * 2; if (groupref >= state->lastmark) { RETURN_FAILURE; } else { @@ -1427,9 +1427,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* match backreference */ TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern, ptr, pattern[0])); - i = pattern[0]; { - Py_ssize_t groupref = i+i; + int groupref = pattern[0] * 2; if (groupref >= state->lastmark) { RETURN_FAILURE; } else { @@ -1453,9 +1452,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* match backreference */ TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern, ptr, pattern[0])); - i = pattern[0]; { - Py_ssize_t groupref = i+i; + int groupref = pattern[0] * 2; if (groupref >= state->lastmark) { RETURN_FAILURE; } else { @@ -1479,9 +1477,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern, ptr, pattern[0])); /* codeyes codeno ... 
*/ - i = pattern[0]; { - Py_ssize_t groupref = i+i; + int groupref = pattern[0] * 2; if (groupref >= state->lastmark) { pattern += pattern[1]; DISPATCH; From f811dfc22cddd61c8cbfb084e41020581556e432 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Mon, 18 Apr 2022 20:40:23 +0800 Subject: [PATCH 5/5] Fix indentation --- Modules/_sre/sre_lib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index dcf18982c65966..db624aa896d6a7 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -616,9 +616,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->lastindex = i/2 + 1; if (i > state->lastmark) { /* state->lastmark is the highest valid index in the - state->mark array. If it is increased by more than 1, - the intervening marks must be set to NULL to signal - that these marks have not been encountered. */ + state->mark array. If it is increased by more than 1, + the intervening marks must be set to NULL to signal + that these marks have not been encountered. */ int j = state->lastmark + 1; while (j < i) state->mark[j++] = NULL;