-
Couldn't load subscription status.
- Fork 5.2k
Replace nop regex loops with empty #118079
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@MihuBot regexdiff |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR optimizes regex patterns by removing no-op loops that wrap zero-width assertions and have a minimum bound of 0. When a loop wraps a zero-width assertion (such as a lookaround, anchor, or word boundary) and has a minimum iteration count of 0, it can be eliminated entirely: whether or not the assertion holds, the loop succeeds without consuming any input, so it adds no meaningful behavior.
Key changes:
- Added logic to detect and remove loops containing zero-width assertions when minimum bound is 0
- Updated test expectations to reflect improved optimization capabilities
- Added comprehensive test cases covering various zero-width assertion scenarios
Reviewed Changes
Copilot reviewed 3 out of 3 changed files in this pull request and generated 4 comments.
| File | Description |
|---|---|
| RegexNode.cs | Core optimization logic to detect and eliminate no-op loops with zero-width assertions |
| RegexReductionTests.cs | Added test cases for various nop loop patterns and non-removable loop edge cases |
| RegexFindOptimizationsTests.cs | Updated test expectations to reflect improved anchor detection after loop removal |
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
Outdated
Show resolved
Hide resolved
|
Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions |
When loop bodies end up containing zero-width assertions and the loop has a min bound of 0, the whole loop can be removed, as the zero-width assertion may or may not apply.
4986539 to
9202fff
Compare
|
38 out of 18857 patterns have generated source code changes.
Examples of GeneratedRegex source diffs:
"(?<!\\\\)?\".*?(?<!\\\\)\"" (89 uses)
[GeneratedRegex("(?<!\\\\)?\".*?(?<!\\\\)\"")]
/// <code>(?<!\\\\)?".*?(?<!\\\\)"</code><br/>
/// Explanation:<br/>
/// <code>
- /// ○ Optional (greedy).<br/>
- /// ○ Zero-width negative lookbehind.<br/>
- /// ○ Match '\\' right-to-left.<br/>
/// ○ Match '"'.<br/>
/// ○ Match a character other than '\n' lazily any number of times.<br/>
/// ○ Zero-width negative lookbehind.<br/>
int pos = base.runtextpos;
int matchStart = pos;
int lazyloop_pos = 0;
- int loop_iteration = 0, loop_starting_pos = 0;
- int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
- // Optional (greedy).
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width negative lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '\\' right-to-left.
- if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '\\')
- {
- goto NegativeLookaroundMatch;
- }
- pos--;
-
- goto LoopIterationNoMatch;
-
- NegativeLookaroundMatch:
- pos = negativelookbehind__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has an upper bound of 1. Continue iterating greedily if the upper bound hasn't
- // yet been reached (as long as the last iteration wasn't empty).
- if (pos != loop_starting_pos && loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
- slice = inputSpan.Slice(pos);
- LoopEnd:;
- //}
-
// Match '"'.
if (slice.IsEmpty || slice[0] != '"')
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// Match a character other than '\n' lazily any number of times.
slice = inputSpan.Slice(pos);
if (slice.IsEmpty || slice[0] == '\n')
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
pos++;
slice = inputSpan.Slice(pos);
// Zero-width negative lookbehind.
{
slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos1 = pos;
+ int negativelookbehind__starting_pos = pos;
if (Utilities.s_hasTimeout)
{
base.CheckTimeout();
// Match '\\' right-to-left.
if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '\\')
{
- goto NegativeLookaroundMatch1;
+ goto NegativeLookaroundMatch;
}
pos--;
goto LazyLoopBacktrack;
- NegativeLookaroundMatch1:
- pos = negativelookbehind__starting_pos1;
+ NegativeLookaroundMatch:
+ pos = negativelookbehind__starting_pos;
slice = inputSpan.Slice(pos);
}
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
-
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 2 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- // If there's space available for all 2 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 1) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- pos += 2;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1);
-
- // <summary>Resize the backtracking stack array and push 2 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- Array.Resize(ref stack, (pos + 1) * 2);
- StackPush(ref stack, ref pos, arg0, arg1);
- }
- }
}
}
"(?<!\\\\)?'.{1,2}?(?<!\\\\)'" (89 uses)
[GeneratedRegex("(?<!\\\\)?'.{1,2}?(?<!\\\\)'")]
/// <code>(?<!\\\\)?'.{1,2}?(?<!\\\\)'</code><br/>
/// Explanation:<br/>
/// <code>
- /// ○ Optional (greedy).<br/>
- /// ○ Zero-width negative lookbehind.<br/>
- /// ○ Match '\\' right-to-left.<br/>
/// ○ Match '\''.<br/>
/// ○ Match a character other than '\n' lazily at least 1 and at most 2 times.<br/>
/// ○ Zero-width negative lookbehind.<br/>
int matchStart = pos;
int lazyloop_iteration = 0;
int lazyloop_pos = 0;
- int loop_iteration = 0, loop_starting_pos = 0;
- int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
- // Optional (greedy).
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width negative lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '\\' right-to-left.
- if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '\\')
- {
- goto NegativeLookaroundMatch;
- }
- pos--;
-
- goto LoopIterationNoMatch;
-
- NegativeLookaroundMatch:
- pos = negativelookbehind__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has an upper bound of 1. Continue iterating greedily if the upper bound hasn't
- // yet been reached (as long as the last iteration wasn't empty).
- if (pos != loop_starting_pos && loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
- slice = inputSpan.Slice(pos);
- LoopEnd:;
- //}
-
// Match '\''.
if (slice.IsEmpty || slice[0] != '\'')
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// Match a character other than '\n' lazily at least 1 and at most 2 times.
//{
if ((uint)slice.Length < 2 || slice[1] == '\n')
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
pos += 2;
LazyLoopBacktrack:
if (lazyloop_iteration >= 1)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
lazyloop_iteration++;
if (Utilities.s_hasTimeout)
slice = inputSpan.Slice(pos);
if (slice.IsEmpty || slice[0] == '\n')
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
pos++;
slice = inputSpan.Slice(pos);
// Zero-width negative lookbehind.
{
slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos1 = pos;
+ int negativelookbehind__starting_pos = pos;
if (Utilities.s_hasTimeout)
{
base.CheckTimeout();
// Match '\\' right-to-left.
if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '\\')
{
- goto NegativeLookaroundMatch1;
+ goto NegativeLookaroundMatch;
}
pos--;
goto LazyLoopBacktrack;
- NegativeLookaroundMatch1:
- pos = negativelookbehind__starting_pos1;
+ NegativeLookaroundMatch:
+ pos = negativelookbehind__starting_pos;
slice = inputSpan.Slice(pos);
}
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
-
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 2 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- // If there's space available for all 2 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 1) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- pos += 2;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1);
-
- // <summary>Resize the backtracking stack array and push 2 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- Array.Resize(ref stack, (pos + 1) * 2);
- StackPush(ref stack, ref pos, arg0, arg1);
- }
- }
}
}
"(?<day>01|02|03|04|05|06|07|08|09|10|11|12|1 ..." (61 uses)
[GeneratedRegex("(?<day>01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|1|20|21|22|23|24|25|26|27|28|29|2|30|31|3|4|5|6|7|8|9)日?(?=\\b|t|まで|から)?", RegexOptions.ExplicitCapture | RegexOptions.Singleline)]
/// ○ Match '3'.<br/>
/// ○ Match a character in the set [01].<br/>
/// ○ Match a character in the set [3-9].<br/>
- /// ○ Match '日' greedily, optionally.<br/>
- /// ○ Optional (greedy).<br/>
- /// ○ Zero-width positive lookahead.<br/>
- /// ○ Match with 4 alternative expressions, atomically.<br/>
- /// ○ Match if at a word boundary.<br/>
- /// ○ Match 't'.<br/>
- /// ○ Match the string "まで".<br/>
- /// ○ Match the string "から".<br/>
+ /// ○ Match '日' atomically, optionally.<br/>
/// </code>
/// </remarks>
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
int alternation_starting_pos1 = 0;
int alternation_starting_pos2 = 0;
int capture_starting_pos = 0;
- int charloop_capture_pos = 0;
- int charloop_starting_pos = 0, charloop_ending_pos = 0;
- int loop_iteration = 0, loop_starting_pos = 0;
- int stackpos = 0;
- int startingStackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// "day" capture group.
CaptureSkipBacktrack:;
//}
- // Match '日' greedily, optionally.
- //{
- charloop_starting_pos = pos;
-
+ // Match '日' atomically, optionally.
+ {
if (!slice.IsEmpty && slice[0] == '日')
{
slice = slice.Slice(1);
pos++;
}
-
- charloop_ending_pos = pos;
- goto CharLoopEnd;
-
- CharLoopBacktrack:
- UncaptureUntil(charloop_capture_pos);
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- if (charloop_starting_pos >= charloop_ending_pos)
- {
- goto CaptureBacktrack;
- }
- pos = --charloop_ending_pos;
- slice = inputSpan.Slice(pos);
-
- CharLoopEnd:
- charloop_capture_pos = base.Crawlpos();
- //}
-
- // Optional (greedy).
- {
- startingStackpos = stackpos;
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width positive lookahead.
- {
- int positivelookahead_starting_pos = pos;
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match with 4 alternative expressions, atomically.
- {
- int alternation_starting_pos3 = pos;
-
- // Branch 0
- {
- // Match if at a word boundary.
- if (!Utilities.IsBoundary(inputSpan, pos))
- {
- goto AlternationBranch6;
- }
-
- goto AlternationMatch3;
-
- AlternationBranch6:
- pos = alternation_starting_pos3;
- slice = inputSpan.Slice(pos);
- }
-
- // Branch 1
- {
- // Match 't'.
- if (slice.IsEmpty || slice[0] != 't')
- {
- goto AlternationBranch7;
- }
-
- pos++;
- slice = inputSpan.Slice(pos);
- goto AlternationMatch3;
-
- AlternationBranch7:
- pos = alternation_starting_pos3;
- slice = inputSpan.Slice(pos);
- }
-
- // Branch 2
- {
- // Match the string "まで".
- if (!slice.StartsWith("まで"))
- {
- goto AlternationBranch8;
- }
-
- pos += 2;
- slice = inputSpan.Slice(pos);
- goto AlternationMatch3;
-
- AlternationBranch8:
- pos = alternation_starting_pos3;
- slice = inputSpan.Slice(pos);
- }
-
- // Branch 3
- {
- // Match the string "から".
- if (!slice.StartsWith("から"))
- {
- goto LoopIterationNoMatch;
- }
-
- pos += 2;
- slice = inputSpan.Slice(pos);
- }
-
- AlternationMatch3:;
- }
-
- pos = positivelookahead_starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has an upper bound of 1. Continue iterating greedily if the upper bound hasn't
- // yet been reached (as long as the last iteration wasn't empty).
- if (pos != loop_starting_pos && loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- goto CharLoopBacktrack;
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
- UncaptureUntil(base.runstack![--stackpos]);
- slice = inputSpan.Slice(pos);
- LoopEnd:
- stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
}
// The input matched.
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
-
- /// <summary>Determines whether the specified index is a boundary.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)
- {
- int indexMinus1 = index - 1;
- return ((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=
- ((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));
-
- static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\u200C' | ch == '\u200D');
- }
-
- /// <summary>Determines whether the character is part of the [\w] set.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static bool IsWordChar(char ch)
- {
- // Mask of Unicode categories that combine to form [\w]
- const int WordCategoriesMask =
- 1 << (int)UnicodeCategory.UppercaseLetter |
- 1 << (int)UnicodeCategory.LowercaseLetter |
- 1 << (int)UnicodeCategory.TitlecaseLetter |
- 1 << (int)UnicodeCategory.ModifierLetter |
- 1 << (int)UnicodeCategory.OtherLetter |
- 1 << (int)UnicodeCategory.NonSpacingMark |
- 1 << (int)UnicodeCategory.DecimalDigitNumber |
- 1 << (int)UnicodeCategory.ConnectorPunctuation;
-
- // Bitmap for whether each character 0 through 127 is in [\w]
- ReadOnlySpan<byte> ascii = new byte[]
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
- 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07
- };
-
- // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.
- int chDiv8 = ch >> 3;
- return (uint)chDiv8 < (uint)ascii.Length ?
- (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
- (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
- }
-
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 3 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- // If there's space available for all 3 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 2) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- s[p + 2] = arg2;
- pos += 3;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1, arg2);
-
- // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- Array.Resize(ref stack, (pos + 2) * 2);
- StackPush(ref stack, ref pos, arg0, arg1, arg2);
- }
- }
}
}
For more diff examples, see https://gist.github.com/MihuBot/a78f3bd6a086ff6dbc537a93cce8b69b
For a list of JIT diff regressions, see Regressions.md
Sample source code for further analysis:
const string JsonPath = "RegexResults-1283.json";
// Download the results archive only when no local copy exists at JsonPath
// (JsonPath is declared on the line just above this block).
if (!File.Exists(JsonPath))
{
// Fetch the zip archive as a stream; `await using` disposes it asynchronously when this scope ends.
await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2hp_ZI");
using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
// Extract the entry named "Results.json" to JsonPath. First() throws if the archive has no such entry.
archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}
// Open the local JSON file and deserialize it into an array of RegexEntry.
// IncludeFields = true makes the serializer handle public fields as well as properties
// (presumably needed for the value-tuple members of RegexEntry — confirm).
// The trailing `!` suppresses the nullable warning; Deserialize can return null for a JSON `null` payload.
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
// Report how many entries were loaded.
Console.WriteLine($"Working with {entries.Length} patterns");
// A regex pattern, the RegexOptions it is used with, and a count
// (presumably the number of known uses of the pattern — confirm against the data source).
record KnownPattern(string Pattern, RegexOptions Options, int Count);
// One element of the deserialized results array: a known pattern plus the source text and diffs recorded for it.
// The `required` members must be set whenever an instance is created; the nullable members may be absent.
sealed class RegexEntry
{
// The pattern, options, and count this entry describes.
public required KnownPattern Regex { get; set; }
// Presumably the generated source produced by the main branch — confirm.
public required string MainSource { get; set; }
// Presumably the generated source produced by the pull request build — confirm.
public required string PrSource { get; set; }
// Presumably the complete diff between MainSource and PrSource; may be null — confirm.
public string? FullDiff { get; set; }
// Presumably an abbreviated form of the diff; may be null — confirm.
public string? ShortDiff { get; set; }
// Optional (name, values) pairs; presumably the char-based SearchValues instances in the generated code — confirm.
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
// Optional (values, comparison type) pairs; presumably the string-based SearchValues instances in the generated code — confirm.
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
} |
|
/ba-g dead letter queue |
When loop bodies end up containing zero-width assertions and the loop has a min bound of 0, the whole loop can be removed, as the zero-width assertion may or may not apply.