diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 1acada27f5179a..c2342a27be41ef 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -668,18 +668,44 @@ void EmitAnchorAndLeadingChecks() private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) { Debug.Assert(rm.Code.Tree.Root.Type == RegexNode.Capture); + if ((rm.Options & RegexOptions.NonBacktracking) != 0) { EmitNonBacktrackingGo(writer, rm, id); + return; } - else if (RegexNode.NodeSupportsSimplifiedCodeGenerationImplementation(rm.Code.Tree.Root.Child(0), RegexNode.DefaultMaxRecursionDepth) && - (((RegexOptions)rm.Code.Tree.Root.Options) & RegexOptions.RightToLeft) == 0) + RegexNode root = rm.Code.Tree.Root; + if (!ExceedsMaxDepthForSimpleCodeGeneration(root) && + root.Child(0).SupportsSimplifiedCodeGenerationImplementation() && + (((RegexOptions)root.Options) & RegexOptions.RightToLeft) == 0) { EmitSimplifiedGo(writer, rm, id); + return; } - else + + EmitCompleteGo(writer, rm, id); + + // Deep RegexNode trees used with the simplified code generator can result in + // emitting C# code that exceeds C# compiler limitations, leading to "CS8078: An + // expression is too long or complex to compile". Place an artificial limit on + // max tree depth in order to mitigate such issues. 
+ static bool ExceedsMaxDepthForSimpleCodeGeneration(RegexNode node, int maxDepth = 30) { - EmitCompleteGo(writer, rm, id); + if (maxDepth <= 0) + { + return true; + } + + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (ExceedsMaxDepthForSimpleCodeGeneration(node.Child(i), maxDepth - 1)) + { + return true; + } + } + + return false; } } @@ -698,8 +724,8 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, bool rtl = code.RightToLeft; bool hasTimeout = false; - int nextLocalId = 0; - string GetNextLocalId() => $"i{nextLocalId++}"; ; + int localCounter = 0; + string NextLocalName(string prefix) => $"{prefix}{localCounter++}"; RegexNode node = rm.Code.Tree.Root; Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); @@ -708,6 +734,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); + // If there's any backtracking in the expression, nodes may emit labels that their peers + // need to jump to. Scopes (which we emit for readability) get in the way of that. As such, + // for nodes that emit such labels, we emit faux, commented-out scopes instead. + HashSet nodesWithCrossScopeLabels = NodesWithCrossScopeLabels(node); + // In some limited cases, FindFirstChar will only return true if it successfully matched the whole thing. // This is the case, in particular, for strings. We can special case these to do essentially nothing // in Go other than emit the capture. 
@@ -750,16 +781,15 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, LoadTextSpanLocal(writer, defineLocal: true); writer.WriteLine(); - int localCounter = 0; - string NextLocalName(string prefix) => $"{prefix}{localCounter++}"; - int labelCounter = 0; - string DefineLabel() => $"L{labelCounter++}"; + string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; void MarkLabel(string label) => writer.WriteLine($"{label}:"); void Goto(string label) => writer.WriteLine($"goto {label};"); string doneLabel = "NoMatch"; + string originalDoneLabel = doneLabel; // Emit the code for all nodes in the tree. + bool expressionHasCaptures = (node.Options & RegexNode.HasCapturesFlag) != 0; EmitNode(node); // Emit success @@ -775,10 +805,10 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Emit failure writer.WriteLine("// No match"); - MarkLabel(doneLabel); - if ((node.Options & RegexNode.HasCapturesFlag) != 0) + MarkLabel(originalDoneLabel); + if (expressionHasCaptures) { - writer.WriteLine("while (base.Crawlpos() != 0) base.Uncapture();"); + EmitUncaptureUntil("0"); } else { @@ -945,7 +975,7 @@ static RegexNode CloneMultiWithoutFirstChar(RegexNode node) void EmitAllBranches() { // Label to jump to when any branch completes successfully. - string doneAlternateLabel = DefineLabel(); + string doneAlternateLabel = DefineLabel("Match"); // Save off runtextpos. We'll need to reset this each time a branch fails. string startingRunTextPosName = NextLocalName("startingRunTextPos"); @@ -957,9 +987,10 @@ void EmitAllBranches() // as the alternation is atomic, so we're not concerned about captures after // the alternation. 
bool hasStartingCrawlpos = (node.Options & RegexNode.HasCapturesFlag) != 0; + string startingCrawlPos = NextLocalName("startingCrawlPos"); if (hasStartingCrawlpos) { - writer.WriteLine("int startingCrawlpos = base.Crawlpos();"); + writer.WriteLine($"int {startingCrawlPos} = base.Crawlpos();"); } writer.WriteLine(); @@ -971,7 +1002,7 @@ void EmitAllBranches() { using var __ = EmitScope(writer, $"Branch {i}"); - string nextBranch = DefineLabel(); + string nextBranch = DefineLabel("NoMatch"); doneLabel = nextBranch; // Emit the code for each branch. @@ -994,7 +1025,7 @@ void EmitAllBranches() textSpanPos = startingTextSpanPos; if (hasStartingCrawlpos) { - EmitUncaptureUntil(); + EmitUncaptureUntil(startingCrawlPos); } } @@ -1005,14 +1036,14 @@ void EmitAllBranches() { if (hasStartingCrawlpos) { - string uncapture = DefineLabel(); + string uncapture = DefineLabel("Uncapture"); doneLabel = uncapture; EmitNode(node.Child(childCount - 1)); doneLabel = postAlternateDoneLabel; TransferTextSpanPosToRunTextPos(); writer.WriteLine($"goto {doneAlternateLabel};"); MarkLabel(uncapture); - EmitUncaptureUntil(); + EmitUncaptureUntil(startingCrawlPos); writer.WriteLine($"goto {doneLabel};"); } else @@ -1031,7 +1062,7 @@ void EmitAllBranches() } // Emits the code for a Capture node. - void EmitCapture(RegexNode node) + void EmitCapture(RegexNode node, RegexNode? subsequent = null) { Debug.Assert(node.N == -1); @@ -1049,16 +1080,19 @@ void EmitCapture(RegexNode node) writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); // Emit child node. - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); TransferTextSpanPosToRunTextPos(); writer.WriteLine($"base.Capture({capnum}, {startingRunTextPosName}, runtextpos);"); } // Emits code to unwind the capture stack until the crawl position specified in the provided local. 
- void EmitUncaptureUntil() + void EmitUncaptureUntil(string crawlpos) { - writer.WriteLine("while (base.Crawlpos() != startingCrawlpos) base.Uncapture();"); + using (EmitBlock(writer, $"while (base.Crawlpos() != {crawlpos})")) + { + writer.WriteLine("base.Uncapture();"); + } } // Emits the code to handle a positive lookahead assertion. @@ -1088,7 +1122,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) int startingTextSpanPos = textSpanPos; string originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + string negativeLookaheadDoneLabel = DefineLabel("Match"); + doneLabel = negativeLookaheadDoneLabel; // Emit the child. EmitNode(node.Child(0)); @@ -1098,7 +1133,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) Goto(originalDoneLabel); // Failures (success for a negative lookahead) jump here. - MarkLabel(doneLabel); + MarkLabel(negativeLookaheadDoneLabel); + Debug.Assert(doneLabel == negativeLookaheadDoneLabel); doneLabel = originalDoneLabel; // After the child completes in failure (success for negative lookahead), reset the text positions. @@ -1110,8 +1146,14 @@ void EmitNegativeLookaheadAssertion(RegexNode node) static string DescribeNode(RegexNode node) => SymbolDisplay.FormatLiteral(node.Description(), quote: false); // Emits the code for the node. - void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + StackHelper.CallOnEmptyStack(EmitNode, node, subsequent, emitLengthChecksIfRequired); + return; + } + // Separate out several node types that, for conciseness, don't need a header and scope written into the source. 
switch (node.Type) { @@ -1119,12 +1161,13 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) return; case RegexNode.Atomic: - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); return; } - // Put the node's code into its own scope - using var _ = EmitScope(writer, DescribeNode(node)); + // Put the node's code into its own scope. If the node contains labels that may need to + // be visible outside of its scope, the scope is still emitted for clarity but is commented out. + using var _ = EmitScope(writer, DescribeNode(node), nodesWithCrossScopeLabels.Contains(node)); switch (node.Type) { @@ -1180,12 +1223,15 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Oneloop: - case RegexNode.Onelazy: case RegexNode.Notoneloop: - case RegexNode.Notonelazy: case RegexNode.Setloop: + EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); + break; + + case RegexNode.Onelazy: + case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -1199,19 +1245,20 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) for (; i < exclusiveEnd; i++) { - EmitNode(node.Child(i), emitLengthChecksIfRequired: false); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); } i--; - continue; } - - EmitNode(node.Child(i), emitLengthChecksIfRequired); + else + { + EmitNode(node.Child(i), i + 1 < childCount ? 
node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: emitLengthChecksIfRequired); + } } break; case RegexNode.Capture: - EmitCapture(node); + EmitCapture(node, subsequent); break; case RegexNode.Require: @@ -1236,6 +1283,42 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) } } + /// + /// Provides a set of all the nodes in the node tree that contains a node + /// which triggers backtracking and thus may emit labels that peer nodes need + /// to be able to see. + /// + static HashSet NodesWithCrossScopeLabels(RegexNode node) + { + var results = new HashSet(); + NodesWithCrossScopeLabels(node, results); + return results; + + static bool NodesWithCrossScopeLabels(RegexNode node, HashSet results) + { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + return StackHelper.CallOnEmptyStack(NodesWithCrossScopeLabels, node, results); + } + + // Nodes that trigger backtracking and thus may emit labels that need to be reached by non-descendants. + bool contains = node.InstigatesBacktracking; + + int childcount = node.ChildCount(); + for (int i = 0; i < childcount; i++) + { + contains |= NodesWithCrossScopeLabels(node.Child(i), results); + } + + if (contains) + { + results.Add(node); + } + + return contains; + } + } + // Emits the code to handle updating base.runtextpos to runtextpos in response to // an UpdateBumpalong node. This is used when we want to inform the scan loop that // it should bump from this location rather than from the original location. @@ -1248,33 +1331,20 @@ void EmitUpdateBumpalong() // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? 
offset = null) { - string expr = $"{textSpanLocal}[{Sum(textSpanPos, offset)}]"; - switch (node.Type) - { - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we map those looping constructs to the - // appropriate single check. + // This only emits a single check, but it's called from the looping constructs in a loop + // to generate the code for a single check, so we map those looping constructs to the + // appropriate single check. - case RegexNode.Set: - case RegexNode.Setlazy: - case RegexNode.Setloop: - case RegexNode.Setloopatomic: - expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node))}"; - break; - - case RegexNode.One: - case RegexNode.Onelazy: - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} != {Literal(node.Ch)}"; - break; + string expr = $"{textSpanLocal}[{Sum(textSpanPos, offset)}]"; - default: - Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopatomic); - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} == {Literal(node.Ch)}"; - break; + if (node.IsSetFamily) + { + expr = $"!{MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node))}"; + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "!=" : "==")} {Literal(node.Ch)}"; } using (EmitBlock(writer, emitLengthCheck ? 
$"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) @@ -1393,10 +1463,6 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck = true) { // Unroll shorter strings. - // TODO: This might employ 64-bit operations on a 32-bit machine. Decide if avoiding that - // is worth adding further complexity for (RegexOptions.Compiled doesn't have to deal with - // this, as the machine generating the code in-memory is the same one running it.) - // For strings more than two characters and when performing case-sensitive searches, we try to do fewer comparisons // by comparing 2 or 4 characters at a time. Because we might be compiling on one endianness and running on another, // both little and big endian values are emitted and which is used is selected at run-time. @@ -1476,7 +1542,7 @@ void EmitOr() else { EmitSpanLengthCheck(str.Length); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {Literal(node.Str)}.Length; {i}++)")) { using (EmitBlock(writer, $"if ({ToLower(hasTextInfo, options, $"{textSpanLocal}[{textSpanPos} + {i}]")} != {Literal(str)}[{i}])")) @@ -1489,9 +1555,90 @@ void EmitOr() } } + void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + { + // If this is actually a repeater, emit that instead; no backtracking necessary. + if (node.M == node.N) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + return; + } + + // Emit backtracking around an atomic single char loop. We can then implement the backtracking + // as an afterthought, since we know exactly how many characters are accepted by each iteration + // of the wrapped loop (1). 
+ + Debug.Assert(node.M < node.N); + string backtrackingLabel = DefineLabel("Backtrack"); + string endLoop = DefineLabel("EndLoop"); + string startingPos = NextLocalName("startingRunTextPos"); + string endingPos = NextLocalName("endingRunTextPos"); + string crawlPos = NextLocalName("crawlPos"); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // Grab the current position, then emit the loop as atomic, and then + // grab the current position again. Even though we emit the loop without + // knowledge of backtracking, we can layer it on top by just walking back + // through the individual characters (a benefit of the loop matching exactly + // one character per iteration, no possible captures within the loop, etc.) + writer.WriteLine($"int {startingPos} = runtextpos;"); + EmitSingleCharAtomicLoop(node); + TransferTextSpanPosToRunTextPos(); + writer.WriteLine($"int {endingPos} = runtextpos;"); + writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + if (node.M > 0) + { + writer.WriteLine($"{startingPos} += {node.M};"); + } + writer.WriteLine($"goto {endLoop};"); + writer.WriteLine(); + + // Backtracking section. Subsequent failures will jump to here, at which + // point we decrement the matched count as long as it's above the minimum + // required, and try again by flowing to everything that comes after this. + MarkLabel(backtrackingLabel); + string originalDoneLabel = doneLabel; + using (EmitBlock(writer, $"if ({startingPos} >= {endingPos})")) + { + writer.WriteLine($"goto {originalDoneLabel};"); + } + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. 
+ EmitUncaptureUntil(crawlPos); + } + + if (subsequent?.FindStartingCharacter() is char subsequentCharacter) + { + writer.WriteLine($"{endingPos} = runtext.LastIndexOf({Literal(subsequentCharacter)}, {endingPos} - 1, {endingPos} - {startingPos});"); + using (EmitBlock(writer, $"if ({endingPos} < 0)")) + { + writer.WriteLine($"goto {originalDoneLabel};"); + } + writer.WriteLine($"runtextpos = {endingPos};"); + } + else + { + writer.WriteLine($"runtextpos = --{endingPos};"); + } + + LoadTextSpanLocal(writer); + writer.WriteLine(); + + MarkLabel(endLoop); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression. + } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. - void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) + void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) { int iterations = node.M; if (iterations == 0) @@ -1525,7 +1672,7 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) { string spanLocal = "slice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything writer.WriteLine($"global::System.ReadOnlySpan {spanLocal} = {textSpanLocal}.Slice({textSpanPos}, {iterations});"); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {spanLocal}.Length; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1561,7 +1708,7 @@ void EmitNodeRepeater(RegexNode node) // Ensure textSpanPos is 0 prior to emitting the child. 
TransferTextSpanPosToRunTextPos(); - string i = GetNextLocalId(); + string i = NextLocalName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {iterations}; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1574,15 +1721,10 @@ void EmitNodeRepeater(RegexNode node) // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison. void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = true) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); - // If this is actually a repeater, emit that instead. if (node.M == node.N) { - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); return; } @@ -1600,12 +1742,12 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny int numSetChars = 0; - string iterationLocal = "i"; // No need for a dynamically named value, as no other 'i' can be in scope - if (node.Type == RegexNode.Notoneloopatomic && + string iterationLocal = NextLocalName("i"); + if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) { - // For Notoneloopatomic, we're looking for a specific character, as everything until we find + // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, // we can use the vectorized IndexOf to do the search, rather than open-coding it. 
The unbounded // restriction is purely for simplicity; it could be removed in the future with additional code to @@ -1625,14 +1767,14 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = $"{iterationLocal} = {textSpanLocal}.Length;"); } } - else if (node.Type == RegexNode.Setloopatomic && + else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && RegexCharClass.IsNegated(node.Str!)) { // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would - // have been reduced to a Notoneloopatomic), we can use an IndexOfAny to find any of the target characters. + // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. writer.Write($"int {iterationLocal} = global::System.MemoryExtensions.IndexOfAny({textSpanLocal}"); @@ -1650,10 +1792,10 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = $"{iterationLocal} = {textSpanLocal}.Length;"); } } - else if (node.Type == RegexNode.Setloopatomic && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) + else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. - // The unbounded constraint is the same as in the Notoneloopatomic case above, done purely for simplicity. + // The unbounded constraint is the same as in the Notone case above, done purely for simplicity. // int i = runtextend - runtextpos; TransferTextSpanPosToRunTextPos(); @@ -1664,22 +1806,26 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // For everything else, do a normal loop. 
string expr = $"{textSpanLocal}[{iterationLocal}]"; - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - case RegexNode.Notoneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} {(node.Type == RegexNode.Oneloopatomic ? "==" : "!=")} {Literal(node.Ch)}"; - break; - case RegexNode.Setloopatomic: - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); - break; + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } - // Transfer text pos to runtextpos to help with bounds check elimination on the loop. - TransferTextSpanPosToRunTextPos(); + if (minIterations != 0 || maxIterations != int.MaxValue) + { + // For any loops other than * loops, transfer text pos to runtextpos in + // order to zero it out to be able to use the single iteration variable + // for both iteration count and indexer. + TransferTextSpanPosToRunTextPos(); + } - writer.WriteLine($"int {iterationLocal} = 0;"); + writer.WriteLine($"int {iterationLocal} = {textSpanPos};"); + textSpanPos = 0; string maxClause = maxIterations != int.MaxValue ? $"{iterationLocal} < {maxIterations} && " : ""; using (EmitBlock(writer, $"while ({maxClause}(uint){iterationLocal} < (uint){textSpanLocal}.Length && {expr})")) @@ -1708,29 +1854,17 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // Emits the code to handle a non-backtracking optional zero-or-one loop. 
void EmitAtomicSingleCharZeroOrOne(RegexNode node) { - string skipUpdatesLabel = DefineLabel(); - - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); Debug.Assert(node.M == 0 && node.N == 1); string expr = $"{textSpanLocal}[{textSpanPos}]"; - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} == {Literal(node.Ch)}"; - break; - case RegexNode.Notoneloopatomic: - expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); - expr = $"{expr} != {Literal(node.Ch)}"; - break; - case RegexNode.Setloopatomic: - expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); - expr = $"{expr}"; - break; + expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node)); + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)); + expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } using (EmitBlock(writer, $"if ((uint){textSpanPos} < (uint){textSpanLocal}.Length && {expr})")) @@ -1757,7 +1891,8 @@ void EmitAtomicNodeLoop(RegexNode node) } string originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + string atomicNodeLabel = DefineLabel("NoMatch"); + doneLabel = atomicNodeLabel; // We might loop any number of times. In order to ensure this loop // and subsequent code sees textSpanPos the same regardless, we always need it to contain @@ -1772,12 +1907,13 @@ void EmitAtomicNodeLoop(RegexNode node) using (EmitBlock(writer, maxIterations == int.MaxValue ? 
"while (true)" : $"while ({iterationLocal} < {maxIterations})")) { EmitTimeoutCheck(writer, hasTimeout); - string successfulIterationLabel = DefineLabel(); + string successfulIterationLabel = DefineLabel("Match"); // Iteration body string prevDone = doneLabel; - doneLabel = DefineLabel(); + string iterationLabel = DefineLabel("NoMatch"); + doneLabel = iterationLabel; // Save off runtextpos. string startingRunTextPosLocal = NextLocalName("startingRunTextPos"); @@ -1791,7 +1927,8 @@ void EmitAtomicNodeLoop(RegexNode node) // If the generated code gets here, the iteration failed. // Reset state, branch to done. - MarkLabel(doneLabel); + MarkLabel(iterationLabel); + Debug.Assert(doneLabel == iterationLabel); doneLabel = prevDone; // reset done label writer.WriteLine($"runtextpos = {startingRunTextPosLocal};"); Goto(doneLabel); @@ -1801,26 +1938,23 @@ void EmitAtomicNodeLoop(RegexNode node) writer.WriteLine($"{iterationLocal}++;"); } + // Done: + MarkLabel(atomicNodeLabel); + Debug.Assert(doneLabel == atomicNodeLabel); + doneLabel = originalDoneLabel; + // Check to ensure we've found at least min iterations. if (minIterations > 0) { - // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label using (EmitBlock(writer, $"if ({iterationLocal} < {minIterations})")) { - writer.WriteLine($"goto {doneLabel};"); + writer.WriteLine($"goto {originalDoneLabel};"); } } - - // We can't have a label in front of a closing brace, so if we didn't emit the label - // earlier, emit now that we've closed out the scope. - if (minIterations <= 0) + else { - // Done: - MarkLabel(doneLabel); + // Labels require a statement after them. 
writer.WriteLine(";"); - doneLabel = originalDoneLabel; // Restore the original done label } } } @@ -3261,17 +3395,17 @@ private static string MatchCharacterClass(bool hasTextInfo, RegexOptions options private static string Literal(string s) => SymbolDisplay.FormatLiteral(s, quote: true); - private static FinishEmitScope EmitScope(IndentedTextWriter writer, string title) => EmitBlock(writer, $"// {title}", appendBlankLine: true); + private static FinishEmitScope EmitScope(IndentedTextWriter writer, string title, bool faux = false) => EmitBlock(writer, $"// {title}", appendBlankLine: true, faux); - private static FinishEmitScope EmitBlock(IndentedTextWriter writer, string? clause, bool appendBlankLine = false) + private static FinishEmitScope EmitBlock(IndentedTextWriter writer, string? clause, bool appendBlankLine = false, bool faux = false) { if (clause is not null) { writer.WriteLine(clause); } - writer.WriteLine("{"); + writer.WriteLine(faux ? "//{" : "{"); writer.Indent++; - return new FinishEmitScope(writer, appendBlankLine); + return new FinishEmitScope(writer, appendBlankLine, faux); } private static void EmitAdd(IndentedTextWriter writer, string variable, int value) @@ -3293,11 +3427,13 @@ private static void EmitAdd(IndentedTextWriter writer, string variable, int valu { private readonly IndentedTextWriter _writer; private readonly bool _appendBlankLine; + private readonly bool _faux; - public FinishEmitScope(IndentedTextWriter writer, bool appendBlankLine) + public FinishEmitScope(IndentedTextWriter writer, bool appendBlankLine, bool faux) { _writer = writer; _appendBlankLine = appendBlankLine; + _faux = faux; } public void Dispose() @@ -3305,7 +3441,7 @@ public void Dispose() if (_writer is not null) { _writer.Indent--; - _writer.WriteLine("}"); + _writer.WriteLine(_faux ? 
"//}" : "}"); if (_appendBlankLine) { _writer.WriteLine(); diff --git a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj index cb7e816f76f4c5..7f59e37493cd85 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj +++ b/src/libraries/System.Text.RegularExpressions/gen/System.Text.RegularExpressions.Generator.csproj @@ -29,6 +29,7 @@ + diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj index f6da67980d2d18..17f1d0fc877e64 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj +++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj @@ -7,6 +7,7 @@ + @@ -17,8 +18,8 @@ - + @@ -53,7 +54,6 @@ - diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index fabb0020108ddc..f8cae4f7b7e932 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -8,6 +8,7 @@ using System.Reflection; using System.Reflection.Emit; using System.Runtime.InteropServices; +using System.Threading; namespace System.Text.RegularExpressions { @@ -67,6 +68,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_stringAsSpanIntIntMethod = typeof(MemoryExtensions).GetMethod("AsSpan", new Type[] { typeof(string), typeof(int), typeof(int) })!; private static readonly MethodInfo s_stringGetCharsMethod = typeof(string).GetMethod("get_Chars", new Type[] { typeof(int) 
})!; private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!; + private static readonly MethodInfo s_stringLastIndexOfCharIntInt = typeof(string).GetMethod("LastIndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!; protected ILGenerator? _ilg; @@ -1676,7 +1678,7 @@ protected void GenerateFindFirstChar() } } - private bool TryGenerateNonBacktrackingGo(RegexNode node) + private bool TryGenerateSimplifiedGo(RegexNode node) { Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); @@ -1689,7 +1691,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) // Skip the Capture node. We handle the implicit root capture specially. node = node.Child(0); - if (!RegexNode.NodeSupportsSimplifiedCodeGenerationImplementation(node, maxDepth: RegexNode.DefaultMaxRecursionDepth)) + if (!node.SupportsSimplifiedCodeGenerationImplementation()) { return false; } @@ -1741,6 +1743,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) LocalBuilder runtextendLocal = DeclareInt32(); Label stopSuccessLabel = DefineLabel(); Label doneLabel = DefineLabel(); + Label originalDoneLabel = doneLabel; if (_hasTimeout) { _loopTimeoutCounterLocal = DeclareInt32(); @@ -1771,6 +1774,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) LoadTextSpanLocal(); // Emit the code for all nodes in the tree. + bool expressionHasCaptures = (node.Options & RegexNode.HasCapturesFlag) != 0; EmitNode(node); // Success: @@ -1795,14 +1799,14 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) Call(s_captureMethod); // If the graph contained captures, undo any remaining to handle failed matches. 
- if ((node.Options & RegexNode.HasCapturesFlag) != 0) + if (expressionHasCaptures) { // while (Crawlpos() != 0) Uncapture(); Label finalReturnLabel = DefineLabel(); Br(finalReturnLabel); - MarkLabel(doneLabel); + MarkLabel(originalDoneLabel); Label condition = DefineLabel(); Label body = DefineLabel(); Br(condition); @@ -1820,7 +1824,7 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node) else { // Done: - MarkLabel(doneLabel); + MarkLabel(originalDoneLabel); } // return; @@ -1936,7 +1940,7 @@ void EmitAtomicAlternate(RegexNode node) // BranchN(); // jumps to Done on failure // Save off runtextpos. We'll need to reset this each time a branch fails. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; @@ -1945,10 +1949,10 @@ void EmitAtomicAlternate(RegexNode node) // state. Note that this is only about subexpressions within the alternation, // as the alternation is atomic, so we're not concerned about captures after // the alternation. - RentedLocalBuilder? startingCrawlpos = null; + LocalBuilder? startingCrawlpos = null; if ((node.Options & RegexNode.HasCapturesFlag) != 0) { - startingCrawlpos = RentInt32Local(); + startingCrawlpos = DeclareInt32(); Ldthis(); Call(s_crawlposMethod); Stloc(startingCrawlpos); @@ -1959,7 +1963,7 @@ void EmitAtomicAlternate(RegexNode node) // A failure in a branch other than the last should jump to the next // branch, not to the final done. 
- Label postAlternateDone = doneLabel; + Label originalDoneLabel = doneLabel; int childCount = node.ChildCount(); for (int i = 0; i < childCount - 1; i++) @@ -2000,7 +2004,7 @@ void EmitAtomicAlternate(RegexNode node) Label uncapture = DefineLabel(); doneLabel = uncapture; EmitNode(node.Child(childCount - 1)); - doneLabel = postAlternateDone; + doneLabel = originalDoneLabel; TransferTextSpanPosToRunTextPos(); Br(doneAlternate); @@ -2010,24 +2014,21 @@ void EmitAtomicAlternate(RegexNode node) } else { - doneLabel = postAlternateDone; + doneLabel = originalDoneLabel; EmitNode(node.Child(childCount - 1)); TransferTextSpanPosToRunTextPos(); } // Successfully completed the alternate. MarkLabel(doneAlternate); - - startingCrawlpos?.Dispose(); - Debug.Assert(textSpanPos == 0); } // Emits the code for a Capture node. - void EmitCapture(RegexNode node) + void EmitCapture(RegexNode node, RegexNode? subsequent = null) { Debug.Assert(node.N == -1); - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); // Get the capture number. This needs to be kept // in sync with MapCapNum in RegexWriter. @@ -2047,7 +2048,7 @@ void EmitCapture(RegexNode node) Stloc(startingRunTextPos); // Emit child node. - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); // runtextpos += textSpanPos; // textSpan = textSpan.Slice(textSpanPos); @@ -2083,7 +2084,7 @@ void EmitUncaptureUntil(LocalBuilder startingCrawlpos) void EmitPositiveLookaheadAssertion(RegexNode node) { // Save off runtextpos. We'll need to reset this upon successful completion of the lookahead. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; @@ -2103,13 +2104,14 @@ void EmitPositiveLookaheadAssertion(RegexNode node) void EmitNegativeLookaheadAssertion(RegexNode node) { // Save off runtextpos. 
We'll need to reset this upon successful completion of the lookahead. - using RentedLocalBuilder startingRunTextPos = RentInt32Local(); + LocalBuilder startingRunTextPos = DeclareInt32(); Ldloc(runtextposLocal); Stloc(startingRunTextPos); int startingTextSpanPos = textSpanPos; Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label negativeLookaheadDoneLabel = DefineLabel(); + doneLabel = negativeLookaheadDoneLabel; // Emit the child. EmitNode(node.Child(0)); @@ -2119,7 +2121,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node) BrFar(originalDoneLabel); // Failures (success for a negative lookahead) jump here. - MarkLabel(doneLabel); + MarkLabel(negativeLookaheadDoneLabel); + Debug.Assert(doneLabel == negativeLookaheadDoneLabel); doneLabel = originalDoneLabel; // After the child completes in failure (success for negative lookahead), reset the text positions. @@ -2130,8 +2133,14 @@ void EmitNegativeLookaheadAssertion(RegexNode node) } // Emits the code for the node. - void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthChecksIfRequired = true) { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + StackHelper.CallOnEmptyStack(EmitNode, node, subsequent, emitLengthChecksIfRequired); + return; + } + switch (node.Type) { case RegexNode.One: @@ -2182,7 +2191,7 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Atomic: - EmitNode(node.Child(0)); + EmitNode(node.Child(0), subsequent); break; case RegexNode.Alternate: @@ -2190,12 +2199,15 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) break; case RegexNode.Oneloop: - case RegexNode.Onelazy: case RegexNode.Notoneloop: - case RegexNode.Notonelazy: case RegexNode.Setloop: + EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); + break; + + case RegexNode.Onelazy: + case RegexNode.Notonelazy: case RegexNode.Setlazy: - EmitSingleCharRepeater(node, emitLengthChecksIfRequired); + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); break; case RegexNode.Concatenate: @@ -2207,19 +2219,19 @@ void EmitNode(RegexNode node, bool emitLengthChecksIfRequired = true) EmitSpanLengthCheck(requiredLength); for (; i < exclusiveEnd; i++) { - EmitNode(node.Child(i), emitLengthChecksIfRequired: false); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent, emitLengthChecksIfRequired: false); } i--; continue; } - EmitNode(node.Child(i)); + EmitNode(node.Child(i), i + 1 < childCount ? node.Child(i + 1) : subsequent); } break; case RegexNode.Capture: - EmitCapture(node); + EmitCapture(node, subsequent); break; case RegexNode.Require: @@ -2263,6 +2275,10 @@ void EmitUpdateBumpalong() // Emits the code to handle a single-character match. void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? 
offset = null) { + // This only emits a single check, but it's called from the looping constructs in a loop + // to generate the code for a single check, so we check for each "family" (one, notone, set) + // rather than only for the specific single character nodes. + // if ((uint)(textSpanPos + offset) >= textSpan.Length || textSpan[textSpanPos + offset] != ch) goto Done; if (emitLengthCheck) { @@ -2272,41 +2288,26 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o EmitSum(textSpanPos, offset); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we map those looping constructs to the - // appropriate single check. - - case RegexNode.Set: - case RegexNode.Setlazy: - case RegexNode.Setloop: - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(doneLabel); - break; - - case RegexNode.One: - case RegexNode.Onelazy: - case RegexNode.Oneloop: - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(doneLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { BneFar(doneLabel); - break; - - default: - Debug.Assert(node.Type == RegexNode.Notone || node.Type == RegexNode.Notonelazy || node.Type == RegexNode.Notoneloop || node.Type == RegexNode.Notoneloopatomic); - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + } + else // IsNotoneFamily + { BeqFar(doneLabel); - break; + } } textSpanPos++; @@ -2536,9 +2537,109 @@ void EmitMultiChar(RegexNode 
node, bool emitLengthCheck = true) } } + // Emits the code to handle a backtracking, single-character loop. + void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + { + // If this is actually a repeater, emit that instead; no backtracking necessary. + if (node.M == node.N) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + return; + } + + Debug.Assert(node.M < node.N); + Label backtrackingLabel = DefineLabel(); + Label endLoop = DefineLabel(); + LocalBuilder startingPos = DeclareInt32(); + LocalBuilder endingPos = DeclareInt32(); + LocalBuilder crawlPos = DeclareInt32(); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // int startingPos = runtextpos; + // Single char atomic loop + // int endingPos = runtextpos; + // int crawlPos = base.Crawlpos(); + // startingPos += node.M; + // goto endLoop; + Ldloc(runtextposLocal); + Stloc(startingPos); + EmitSingleCharAtomicLoop(node); + TransferTextSpanPosToRunTextPos(); + Ldloc(runtextposLocal); + Stloc(endingPos); + Ldthis(); + Call(s_crawlposMethod); + Stloc(crawlPos); + if (node.M > 0) + { + Ldloc(startingPos); + Ldc(node.M); + Add(); + Stloc(startingPos); + } + Br(endLoop); + + // Backtracking: + // if (startingPos >= endingPos) goto doneLabel; + MarkLabel(backtrackingLabel); + Ldloc(startingPos); + Ldloc(endingPos); + BgeFar(doneLabel); + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + // while (base.Crawlpos() != crawlPos) Uncapture(); + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. 
+ EmitUncaptureUntil(crawlPos); + } + + if (subsequent?.FindStartingCharacter() is char subsequentCharacter) + { + // endingPos = runtext.LastIndexOf(subsequentCharacter, endingPos - 1, endingPos - startingPos); + // if (endingPos < 0) + // { + // goto doneLabel; + // } + Ldloc(runtextLocal); + Ldc(subsequentCharacter); + Ldloc(endingPos); + Ldc(1); + Sub(); + Ldloc(endingPos); + Ldloc(startingPos); + Sub(); + Call(s_stringLastIndexOfCharIntInt); + Stloc(endingPos); + Ldloc(endingPos); + Ldc(0); + BltFar(doneLabel); + } + else + { + // endingPos--; + Ldloc(endingPos); + Ldc(1); + Sub(); + Stloc(endingPos); + } + + // runtextpos = endingPos; + Ldloc(endingPos); + Stloc(runtextposLocal); + + // textspan = runtext.AsSpan(runtextpos, runtextend - runtextpos); + LoadTextSpanLocal(); + + MarkLabel(endLoop); + } + // Emits the code to handle a loop (repeater) with a fixed number of iterations. // RegexNode.M is used for the number of iterations; RegexNode.N is ignored. - void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) + void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true) { int iterations = node.M; @@ -2648,7 +2749,7 @@ void EmitNodeRepeater(RegexNode node) Label conditionLabel = DefineLabel(); Label bodyLabel = DefineLabel(); - using RentedLocalBuilder iterationLocal = RentInt32Local(); + LocalBuilder iterationLocal = DeclareInt32(); Ldc(0); Stloc(iterationLocal); BrFar(conditionLabel); @@ -2675,15 +2776,10 @@ void EmitNodeRepeater(RegexNode node) // Emits the code to handle a non-backtracking, variable-length loop around a single character comparison. void EmitSingleCharAtomicLoop(RegexNode node) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); - // If this is actually a repeater, emit that instead. 
if (node.M == node.N) { - EmitSingleCharRepeater(node); + EmitSingleCharFixedRepeater(node); return; } @@ -2700,17 +2796,16 @@ void EmitSingleCharAtomicLoop(RegexNode node) using RentedLocalBuilder iterationLocal = RentInt32Local(); - Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label atomicLoopDoneLabel = DefineLabel(); Span setChars = stackalloc char[3]; // 3 is max we can use with IndexOfAny int numSetChars = 0; - if (node.Type == RegexNode.Notoneloopatomic && + if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node) || !RegexCharClass.ParticipatesInCaseConversion(node.Ch))) { - // For Notoneloopatomic, we're looking for a specific character, as everything until we find + // For Notone, we're looking for a specific character, as everything until we find // it is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, // we can use the vectorized IndexOf to do the search, rather than open-coding it. The unbounded // restriction is purely for simplicity; it could be removed in the future with additional code to @@ -2731,10 +2826,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) Call(s_spanIndexOf); Stloc(iterationLocal); - // if (i != -1) goto doneLabel; + // if (i >= 0) goto atomicLoopDoneLabel; Ldloc(iterationLocal); - Ldc(-1); - BneFar(doneLabel); + Ldc(0); + BgeFar(atomicLoopDoneLabel); // i = textSpan.Length - textSpanPos; Ldloca(textSpanLocal); @@ -2746,14 +2841,14 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); } - else if (node.Type == RegexNode.Setloopatomic && + else if (node.IsSetFamily && maxIterations == int.MaxValue && !IsCaseInsensitive(node) && (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) > 1 && RegexCharClass.IsNegated(node.Str!)) { // If the set is negated and contains only 2 or 3 characters (if it contained 1 and was negated, it would - // have been reduced to a Notoneloopatomic), we can use an IndexOfAny to find 
any of the target characters. + // have been reduced to a Notone), we can use an IndexOfAny to find any of the target characters. // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. // int i = textSpan.Slice(textSpanPos).IndexOfAny(ch1, ch2{, ch3}); @@ -2781,10 +2876,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); - // if (i != -1) goto doneLabel; + // if (i >= 0) goto atomicLoopDoneLabel; Ldloc(iterationLocal); - Ldc(-1); - BneFar(doneLabel); + Ldc(0); + BgeFar(atomicLoopDoneLabel); // i = textSpan.Length - textSpanPos; Ldloca(textSpanLocal); @@ -2796,10 +2891,10 @@ void EmitSingleCharAtomicLoop(RegexNode node) } Stloc(iterationLocal); } - else if (node.Type == RegexNode.Setloopatomic && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) + else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. - // The unbounded constraint is the same as in the Notoneloopatomic case above, done purely for simplicity. + // The unbounded constraint is the same as in the Notone case above, done purely for simplicity. 
// int i = runtextend - runtextpos; TransferTextSpanPosToRunTextPos(); @@ -2828,39 +2923,37 @@ void EmitSingleCharAtomicLoop(RegexNode node) MarkLabel(bodyLabel); EmitTimeoutCheck(); - // if ((uint)i >= (uint)textSpan.Length) goto doneLabel; + // if ((uint)i >= (uint)textSpan.Length) goto atomicLoopDoneLabel; Ldloc(iterationLocal); Ldloca(textSpanLocal); Call(s_spanGetLengthMethod); - BgeUnFar(doneLabel); + BgeUnFar(atomicLoopDoneLabel); - // if (textSpan[i] != ch) goto Done; + // if (textSpan[i] != ch) goto atomicLoopDoneLabel; Ldloca(textSpanLocal); Ldloc(iterationLocal); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); - BneFar(doneLabel); - break; - case RegexNode.Notoneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); - BeqFar(doneLabel); - break; - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(doneLabel); - break; + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(atomicLoopDoneLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { + BneFar(atomicLoopDoneLabel); + } + else // IsNotoneFamily + { + BeqFar(atomicLoopDoneLabel); + } } // i++; @@ -2869,7 +2962,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) Add(); Stloc(iterationLocal); - // if (i >= maxIterations) goto doneLabel; + // if (i >= maxIterations) goto atomicLoopDoneLabel; MarkLabel(conditionLabel); if (maxIterations != int.MaxValue) { @@ -2884,8 +2977,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) } // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label + 
MarkLabel(atomicLoopDoneLabel); // Check to ensure we've found at least min iterations. if (minIterations > 0) @@ -2914,10 +3006,6 @@ void EmitSingleCharAtomicLoop(RegexNode node) // Emits the code to handle a non-backtracking optional zero-or-one loop. void EmitAtomicSingleCharZeroOrOne(RegexNode node) { - Debug.Assert( - node.Type == RegexNode.Oneloopatomic || - node.Type == RegexNode.Notoneloopatomic || - node.Type == RegexNode.Setloopatomic); Debug.Assert(node.M == 0 && node.N == 1); Label skipUpdatesLabel = DefineLabel(); @@ -2933,28 +3021,26 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) Ldc(textSpanPos); Call(s_spanGetItemMethod); LdindU2(); - switch (node.Type) + if (node.IsSetFamily) { - case RegexNode.Oneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(skipUpdatesLabel); + } + else + { + if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { BneFar(skipUpdatesLabel); - break; - case RegexNode.Notoneloopatomic: - if (IsCaseInsensitive(node) && RegexCharClass.ParticipatesInCaseConversion(node.Ch)) - { - CallToLower(); - } - Ldc(node.Ch); + } + else // IsNotoneFamily + { BeqFar(skipUpdatesLabel); - break; - case RegexNode.Setloopatomic: - EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); - BrfalseFar(skipUpdatesLabel); - break; + } } // textSpan = textSpan.Slice(1); @@ -2986,11 +3072,12 @@ void EmitAtomicNodeLoop(RegexNode node) return; } - using RentedLocalBuilder iterationLocal = RentInt32Local(); - using RentedLocalBuilder startingRunTextPosLocal = RentInt32Local(); + LocalBuilder iterationLocal = DeclareInt32(); + LocalBuilder startingRunTextPosLocal = DeclareInt32(); Label originalDoneLabel = doneLabel; - doneLabel = DefineLabel(); + Label atomicNodeLabel = DefineLabel(); + 
doneLabel = atomicNodeLabel; // We might loop any number of times. In order to ensure this loop // and subsequent code sees textSpanPos the same regardless, we always need it to contain @@ -3021,7 +3108,8 @@ void EmitAtomicNodeLoop(RegexNode node) Label successfulIterationLabel = DefineLabel(); Label prevDone = doneLabel; - doneLabel = DefineLabel(); + Label iterationDone = DefineLabel(); + doneLabel = iterationDone; // Save off runtextpos. Ldloc(runtextposLocal); @@ -3035,8 +3123,10 @@ void EmitAtomicNodeLoop(RegexNode node) // If the generated code gets here, the iteration failed. // Reset state, branch to done. - MarkLabel(doneLabel); - doneLabel = prevDone; // reset done label + MarkLabel(iterationDone); + Debug.Assert(doneLabel == iterationDone); + doneLabel = prevDone; + Ldloc(startingRunTextPosLocal); Stloc(runtextposLocal); BrFar(doneLabel); @@ -3064,8 +3154,9 @@ void EmitAtomicNodeLoop(RegexNode node) } // Done: - MarkLabel(doneLabel); - doneLabel = originalDoneLabel; // Restore the original done label + MarkLabel(atomicNodeLabel); + Debug.Assert(doneLabel == atomicNodeLabel); + doneLabel = originalDoneLabel; // Check to ensure we've found at least min iterations. if (minIterations > 0) @@ -3084,8 +3175,8 @@ protected void GenerateGo() _int32LocalsPool?.Clear(); _readOnlySpanCharLocalsPool?.Clear(); - // Generate backtrack-free code when we're dealing with simpler regexes. - if (TryGenerateNonBacktrackingGo(_code.Tree.Root)) + // Generate simpler code when we're dealing with simpler regexes. 
+ if (TryGenerateSimplifiedGo(_code.Tree.Root)) { return; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 282483e8d90657..c23bb50720a05c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -41,6 +41,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Threading; namespace System.Text.RegularExpressions { @@ -103,8 +104,6 @@ internal sealed class RegexNode public const int Testref = 33; // (?(n) | ) - alternation, reference public const int Testgroup = 34; // (?(...) | )- alternation, expression - internal const byte DefaultMaxRecursionDepth = 20; // arbitrary cut-off to avoid unbounded recursion - /// empty bit from the node's options to store data on whether a node contains captures internal const RegexOptions HasCapturesFlag = (RegexOptions)(1 << 31); @@ -309,7 +308,7 @@ internal RegexNode FinalOptimize() // If we find backtracking construct at the end of the regex, we can instead make it non-backtracking, // since nothing would ever backtrack into it anyway. Doing this then makes the construct available // to implementations that don't support backtracking. - EliminateEndingBacktracking(rootNode, DefaultMaxRecursionDepth); + rootNode.EliminateEndingBacktracking(); // Optimization: unnecessary re-processing of starting loops. // If an expression is guaranteed to begin with a single-character unbounded loop that isn't part of an alternation (in which case it @@ -368,27 +367,29 @@ internal RegexNode FinalOptimize() return rootNode; } - /// Converts nodes at the end of the specified node tree to be atomic. + /// Converts nodes at the end of the node tree to be atomic. 
/// /// The correctness of this optimization depends on nothing being able to backtrack into /// the provided node. That means it must be at the root of the overall expression, or /// it must be an Atomic node that nothing will backtrack into by the very nature of Atomic. /// - private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) + private void EliminateEndingBacktracking() { - if (maxDepth == 0) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { + // If we can't recur further, just stop optimizing. return; } // RegexOptions.NonBacktracking doesn't support atomic groups, so when that option // is set we don't want to create atomic groups where they weren't explicitly authored. - if ((node.Options & RegexOptions.NonBacktracking) != 0) + if ((Options & RegexOptions.NonBacktracking) != 0) { return; } - // Walk the tree starting from the provided node. + // Walk the tree starting from the current node. + RegexNode node = this; while (true) { switch (node.Type) @@ -433,7 +434,7 @@ private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) int branches = node.ChildCount(); for (int i = 1; i < branches; i++) { - EliminateEndingBacktracking(node.Child(i), maxDepth - 1); + node.Child(i).EliminateEndingBacktracking(); } } node = node.Child(0); @@ -444,7 +445,7 @@ private static void EliminateEndingBacktracking(RegexNode node, uint maxDepth) // e.g. (?:abc*)* => (?:ab(?>c*))* case Loop: { - RegexNode? loopDescendent = FindLastExpressionInLoopForAutoAtomic(node, maxDepth - 1); + RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); if (loopDescendent != null) { node = loopDescendent; @@ -601,6 +602,7 @@ private RegexNode ReduceAtomic() // Alternations have a variety of possible optimizations that can be applied // iff they're atomic. case Alternate: + if ((Options & RegexOptions.RightToLeft) == 0) { List? 
branches = child.Children as List; Debug.Assert(branches is not null && branches.Count != 0); @@ -709,7 +711,7 @@ private RegexNode ReduceAtomic() // For everything else, try to reduce ending backtracking of the last contained expression. default: - EliminateEndingBacktracking(child, DefaultMaxRecursionDepth); + child.EliminateEndingBacktracking(); return atomic; } } @@ -1198,9 +1200,56 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan startingSpan) public char FirstCharOfOneOrMulti() { Debug.Assert(Type is One or Multi); + Debug.Assert((Options & RegexOptions.RightToLeft) == 0); return Type == One ? Ch : Str![0]; } + /// Finds the guaranteed beginning character of the node, or null if none exists. + public char? FindStartingCharacter() + { + RegexNode? node = this; + while (true) + { + if (node is null || (node.Options & RegexOptions.RightToLeft) != 0) + { + return null; + } + + char c; + switch (node.Type) + { + case One: + case Oneloop or Oneloopatomic or Onelazy when node.M > 0: + c = node.Ch; + break; + + case Multi: + c = node.Str![0]; + break; + + case Atomic: + case Concatenate: + case Capture: + case Group: + case Loop or Lazyloop when node.M > 0: + case Require: + node = node.Child(0); + continue; + + default: + return null; + } + + if ((node.Options & RegexOptions.IgnoreCase) == 0 || + !RegexCharClass.ParticipatesInCaseConversion(c)) + { + return c; + } + + return null; + } + } + /// /// Optimizes a concatenation by coalescing adjacent characters and strings, /// coalescing adjacent loops, converting loops to be atomic where applicable, @@ -1467,10 +1516,16 @@ private void ReduceConcatenationWithAutoAtomic() var children = (List)Children; for (int i = 0; i < children.Count - 1; i++) { - ProcessNode(children[i], children[i + 1], DefaultMaxRecursionDepth); + ProcessNode(children[i], children[i + 1]); - static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) + static void ProcessNode(RegexNode node, RegexNode subsequent) { + 
if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // If we can't recur further, just stop optimizing. + return; + } + // Skip down the node past irrelevant nodes. while (true) { @@ -1487,7 +1542,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) // compatible for the optimization. if (node.Type == Loop) { - RegexNode? loopDescendent = FindLastExpressionInLoopForAutoAtomic(node, maxDepth - 1); + RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic(); if (loopDescendent != null) { node = loopDescendent; @@ -1502,9 +1557,9 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) // If the node can be changed to atomic based on what comes after it, do so. switch (node.Type) { - case Oneloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): - case Notoneloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): - case Setloop when CanBeMadeAtomic(node, subsequent, maxDepth - 1): + case Oneloop when CanBeMadeAtomic(node, subsequent): + case Notoneloop when CanBeMadeAtomic(node, subsequent): + case Setloop when CanBeMadeAtomic(node, subsequent): node.MakeLoopAtomic(); break; case Alternate: @@ -1518,7 +1573,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) int alternateBranches = node.ChildCount(); for (int b = 0; b < alternateBranches; b++) { - ProcessNode(node.Child(b), subsequent, maxDepth - 1); + ProcessNode(node.Child(b), subsequent); } } break; @@ -1532,8 +1587,10 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) /// that could be made atomic _assuming_ the conditions exist for it with the loop's ancestors. /// /// The found node that should be explored further for auto-atomicity; null if it doesn't exist. - private static RegexNode? FindLastExpressionInLoopForAutoAtomic(RegexNode node, uint maxDepth) + private RegexNode? 
FindLastExpressionInLoopForAutoAtomic() { + RegexNode node = this; + Debug.Assert(node.Type == Loop); // Start by looking at the loop's sole child. @@ -1555,7 +1612,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) { int concatCount = node.ChildCount(); RegexNode lastConcatChild = node.Child(concatCount - 1); - if (CanBeMadeAtomic(lastConcatChild, node.Child(0), maxDepth - 1)) + if (CanBeMadeAtomic(lastConcatChild, node.Child(0))) { return lastConcatChild; } @@ -1569,11 +1626,11 @@ static void ProcessNode(RegexNode node, RegexNode subsequent, uint maxDepth) /// Determines whether node can be switched to an atomic loop. Subsequent is the node /// immediately after 'node'. /// - private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint maxDepth) + private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent) { - if (maxDepth == 0) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - // We hit our recursion limit. Just don't apply the optimization. + // If we can't recur further, just stop optimizing. return false; } @@ -1609,7 +1666,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint m int childCount = subsequent.ChildCount(); for (int i = 0; i < childCount; i++) { - if (!CanBeMadeAtomic(node, subsequent.Child(i), maxDepth - 1)) + if (!CanBeMadeAtomic(node, subsequent.Child(i))) { return false; } @@ -1697,111 +1754,106 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, uint m /// public int ComputeMinLength() { - return ComputeMinLength(this, DefaultMaxRecursionDepth); - - static int ComputeMinLength(RegexNode node, uint maxDepth) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - if (maxDepth == 0) - { - // Don't examine any further, as we've reached the max allowed depth. - return 0; - } + // If we can't recur further, assume there's no minimum we can enforce. 
+ return 0; + } - switch (node.Type) - { - case One: - case Notone: - case Set: - // Single character. - return 1; + switch (Type) + { + case One: + case Notone: + case Set: + // Single character. + return 1; - case Multi: - // Every character in the string needs to match. - return node.Str!.Length; + case Multi: + // Every character in the string needs to match. + return Str!.Length; - case Notonelazy: - case Notoneloop: - case Notoneloopatomic: - case Onelazy: - case Oneloop: - case Oneloopatomic: - case Setlazy: - case Setloop: - case Setloopatomic: - // One character repeated at least M times. - return node.M; + case Notonelazy: + case Notoneloop: + case Notoneloopatomic: + case Onelazy: + case Oneloop: + case Oneloopatomic: + case Setlazy: + case Setloop: + case Setloopatomic: + // One character repeated at least M times. + return M; - case Lazyloop: - case Loop: - // A node graph repeated at least M times. - return (int)Math.Min(int.MaxValue, (long)node.M * ComputeMinLength(node.Child(0), maxDepth - 1)); + case Lazyloop: + case Loop: + // A node graph repeated at least M times. + return (int)Math.Min(int.MaxValue, (long)M * Child(0).ComputeMinLength()); - case Alternate: - // The minimum required length for any of the alternation's branches. + case Alternate: + // The minimum required length for any of the alternation's branches. + { + int childCount = ChildCount(); + Debug.Assert(childCount >= 2); + int min = Child(0).ComputeMinLength(); + for (int i = 1; i < childCount && min > 0; i++) { - int childCount = node.ChildCount(); - Debug.Assert(childCount >= 2); - int min = ComputeMinLength(node.Child(0), maxDepth - 1); - for (int i = 1; i < childCount && min > 0; i++) - { - min = Math.Min(min, ComputeMinLength(node.Child(i), maxDepth - 1)); - } - return min; + min = Math.Min(min, Child(i).ComputeMinLength()); } + return min; + } - case Concatenate: - // The sum of all of the concatenation's children. 
+ case Concatenate: + // The sum of all of the concatenation's children. + { + long sum = 0; + int childCount = ChildCount(); + for (int i = 0; i < childCount; i++) { - long sum = 0; - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - sum += ComputeMinLength(node.Child(i), maxDepth - 1); - } - return (int)Math.Min(int.MaxValue, sum); + sum += Child(i).ComputeMinLength(); } + return (int)Math.Min(int.MaxValue, sum); + } - case Atomic: - case Capture: - case Group: - // For groups, we just delegate to the sole child. - Debug.Assert(node.ChildCount() == 1); - return ComputeMinLength(node.Child(0), maxDepth - 1); - - case Empty: - case Nothing: - case UpdateBumpalong: - // Nothing to match. In the future, we could potentially use Nothing to say that the min length - // is infinite, but that would require a different structure, as that would only apply if the - // Nothing match is required in all cases (rather than, say, as one branch of an alternation). - case Beginning: - case Bol: - case Boundary: - case ECMABoundary: - case End: - case EndZ: - case Eol: - case NonBoundary: - case NonECMABoundary: - case Start: - // Difficult to glean anything meaningful from boundaries or results only known at run time. - case Prevent: - case Require: - // Lookaheads/behinds could potentially be included in the future, but that will require - // a different structure, as they can't be added as part of a concatenation, since they overlap - // with what comes after. - case Ref: - case Testgroup: - case Testref: - // Constructs requiring data at runtime from the matching pattern can't influence min length. - return 0; + case Atomic: + case Capture: + case Group: + // For groups, we just delegate to the sole child. + Debug.Assert(ChildCount() == 1); + return Child(0).ComputeMinLength(); + + case Empty: + case Nothing: + case UpdateBumpalong: + // Nothing to match. 
In the future, we could potentially use Nothing to say that the min length + // is infinite, but that would require a different structure, as that would only apply if the + // Nothing match is required in all cases (rather than, say, as one branch of an alternation). + case Beginning: + case Bol: + case Boundary: + case ECMABoundary: + case End: + case EndZ: + case Eol: + case NonBoundary: + case NonECMABoundary: + case Start: + // Difficult to glean anything meaningful from boundaries or results only known at run time. + case Prevent: + case Require: + // Lookaheads/behinds could potentially be included in the future, but that will require + // a different structure, as they can't be added as part of a concatenation, since they overlap + // with what comes after. + case Ref: + case Testgroup: + case Testref: + // Constructs requiring data at runtime from the matching pattern can't influence min length. + return 0; - default: + default: #if DEBUG - Debug.Fail($"Unknown node: {node.TypeName}"); + Debug.Fail($"Unknown node: {TypeName}"); #endif - goto case Empty; - } + goto case Empty; } } @@ -1826,11 +1878,11 @@ public bool TryGetJoinableLengthCheckChildRange(int childIndex, out int required { static bool CanJoinLengthCheck(RegexNode node) => node.Type switch { - RegexNode.One or RegexNode.Notone or RegexNode.Set => true, - RegexNode.Multi => true, - RegexNode.Oneloop or RegexNode.Onelazy or RegexNode.Oneloopatomic or - RegexNode.Notoneloop or RegexNode.Notonelazy or RegexNode.Notoneloopatomic or - RegexNode.Setloop or RegexNode.Setlazy or RegexNode.Setloopatomic when node.M == node.N => true, + One or Notone or Set => true, + Multi => true, + Oneloop or Onelazy or Oneloopatomic or + Notoneloop or Notonelazy or Notoneloopatomic or + Setloop or Setlazy or Setloopatomic when node.M == node.N => true, _ => false, }; @@ -1961,9 +2013,15 @@ public int ChildCount() return 1; } - // Determines whether the node supports an optimized implementation that doesn't allow for 
backtracking. - internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNode node, int maxDepth) + // Determines whether the node supports an optimized code gen strategy based on walking the node tree. + internal bool SupportsSimplifiedCodeGenerationImplementation() { + if (!StackHelper.TryEnsureSufficientExecutionStack()) + { + // If we can't recur further, simplified code generation isn't supported as the tree is too deep. + return false; + } + bool supported = false; // We only support the default left-to-right, not right-to-left, which requires more complication in the generated code. @@ -1971,97 +2029,117 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod // We also limit the recursion involved to prevent stack dives; this limitation can be removed by switching // away from a recursive implementation (done for convenience) to an iterative one that's more complicated // but within the same problems. - if ((node.Options & RegexOptions.RightToLeft) == 0 && maxDepth > 0) + if ((Options & RegexOptions.RightToLeft) == 0) { - int childCount = node.ChildCount(); - Debug.Assert((node.Options & HasCapturesFlag) == 0); + int childCount = ChildCount(); + Debug.Assert((Options & HasCapturesFlag) == 0); - switch (node.Type) + switch (Type) { // One/Notone/Set/Multi don't involve any repetition and are easily supported. - case RegexNode.One: - case RegexNode.Notone: - case RegexNode.Set: - case RegexNode.Multi: + case One: + case Notone: + case Set: + case Multi: // Boundaries are like set checks and don't involve repetition, either. - case RegexNode.Boundary: - case RegexNode.NonBoundary: - case RegexNode.ECMABoundary: - case RegexNode.NonECMABoundary: + case Boundary: + case NonBoundary: + case ECMABoundary: + case NonECMABoundary: // Anchors are also trivial. 
- case RegexNode.Beginning: - case RegexNode.Start: - case RegexNode.Bol: - case RegexNode.Eol: - case RegexNode.End: - case RegexNode.EndZ: + case Beginning: + case Start: + case Bol: + case Eol: + case End: + case EndZ: // {Set/One/Notone}loopatomic are optimized nodes that represent non-backtracking variable-length loops. // These consume their {Set/One} inputs as long as they match, and don't give up anything they // matched, which means we can support them without backtracking. - case RegexNode.Oneloopatomic: - case RegexNode.Notoneloopatomic: - case RegexNode.Setloopatomic: + case Oneloopatomic: + case Notoneloopatomic: + case Setloopatomic: // "Empty" is easy: nothing is emitted for it. // "Nothing" is also easy: it doesn't match anything. // "UpdateBumpalong" doesn't match anything, it's just an optional directive to the engine. - case RegexNode.Empty: - case RegexNode.Nothing: - case RegexNode.UpdateBumpalong: + case Empty: + case Nothing: + case UpdateBumpalong: supported = true; break; - // Repeaters don't require backtracking as long as their min and max are equal. - // At that point they're just a shorthand for writing out the One/Notone/Set - // that number of times. - case RegexNode.Oneloop: - case RegexNode.Notoneloop: - case RegexNode.Setloop: - Debug.Assert(node.Next == null || node.Next.Type != RegexNode.Atomic, "Loop should have been transformed into an atomic type."); - goto case RegexNode.Onelazy; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - supported = node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic); + // Single character greedy loops are supported if either they're actually a repeater + // or they're not contained in any construct other than simple nesting (e.g. concat, capture). 
+ case Oneloop: + case Notoneloop: + case Setloop: + Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type."); + supported = M == N || AncestorsAllowBacktracking(Next); + static bool AncestorsAllowBacktracking(RegexNode? node) + { + while (node is not null) + { + switch (node.Type) + { + case Concatenate: + case Capture: + case Atomic: + node = node.Next; + break; + + default: + return false; + } + } + + return true; + } + break; + + case Onelazy: + case Notonelazy: + case Setlazy: + supported = M == N || (Next != null && Next.Type == Atomic); break; // {Lazy}Loop repeaters are the same, except their child also needs to be supported. // We also support such loops being atomic. - case RegexNode.Loop: - case RegexNode.Lazyloop: + case Loop: + case Lazyloop: supported = - (node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)) && - NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + (M == N || (Next != null && Next.Type == Atomic)) && + Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; // We can handle atomic as long as we can handle making its child atomic, or // its child doesn't have that concept. - case RegexNode.Atomic: + case Atomic: // Lookahead assertions also only require that the child node be supported. // The RightToLeft check earlier is important to differentiate lookbehind, // which is not supported. - case RegexNode.Require: - case RegexNode.Prevent: - supported = NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + case Require: + case Prevent: + supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; // We can handle alternates as long as they're atomic (a root / global alternate is // effectively atomic, as nothing will try to backtrack into it as it's the last thing). // Its children must all also be supported. 
- case RegexNode.Alternate: - if (node.Next != null && - (node.IsAtomicByParent() || // atomic alternate - (node.Next.Type == RegexNode.Capture && node.Next.Next is null))) // root alternate + case Alternate: + if (Next != null && + (IsAtomicByParent() || // atomic alternate + (Next.Type == Capture && Next.Next is null))) // root alternate { - goto case RegexNode.Concatenate; + goto case Concatenate; } break; // Concatenation doesn't require backtracking as long as its children don't. - case RegexNode.Concatenate: + case Concatenate: supported = true; for (int i = 0; i < childCount; i++) { - if (supported && !NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(i), maxDepth - 1)) + if (!Child(i).SupportsSimplifiedCodeGenerationImplementation()) { supported = false; break; @@ -2069,22 +2147,22 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod } break; - case RegexNode.Capture: + case Capture: // Currently we only support capnums without uncapnums (for balancing groups) - supported = node.N == -1; + supported = N == -1; if (supported) { // And we only support them in certain places in the tree. - RegexNode? parent = node.Next; + RegexNode? parent = Next; while (parent != null) { switch (parent.Type) { - case RegexNode.Alternate: - case RegexNode.Atomic: - case RegexNode.Capture: - case RegexNode.Concatenate: - case RegexNode.Require: + case Alternate: + case Atomic: + case Capture: + case Concatenate: + case Require: parent = parent.Next; break; @@ -2098,13 +2176,13 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod if (supported) { // And we only support them if their children are supported. - supported = NodeSupportsSimplifiedCodeGenerationImplementation(node.Child(0), maxDepth - 1); + supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); // If we've found a supported capture, mark all of the nodes in its parent // hierarchy as containing a capture. 
if (supported) { - parent = node; + parent = this; while (parent != null && ((parent.Options & HasCapturesFlag) == 0)) { parent.Options |= HasCapturesFlag; @@ -2117,14 +2195,33 @@ internal static bool NodeSupportsSimplifiedCodeGenerationImplementation(RegexNod } } #if DEBUG - if (!supported && (node.Options & RegexOptions.Debug) != 0) + if (!supported && (Options & RegexOptions.Debug) != 0) { - Debug.WriteLine($"Unable to use non-backtracking code gen: node {node.Description()} isn't supported."); + Debug.WriteLine($"Unable to use non-backtracking code gen: node {Description()} isn't supported."); } #endif return supported; } + /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node. + public bool IsSetFamily => Type is Set or Setloop or Setloopatomic or Setlazy; + + /// Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node. + public bool IsOneFamily => Type is One or Oneloop or Oneloopatomic or Onelazy; + + /// Gets whether the node is a Notone/Notoneloop/Notoneloopatomic/Notonelazy node. + public bool IsNotoneFamily => Type is Notone or Notoneloop or Notoneloopatomic or Notonelazy; + + /// Gets whether this node may be a source of backtracking. 
+ public bool InstigatesBacktracking => + Type switch + { + Oneloop or Notoneloop or Setloop or Onelazy or Notonelazy or Setlazy or Loop or Lazyloop when !IsAtomicByParent() && M != N => true, + Alternate => !IsAtomicByParent(), + Ref or Testref or Testgroup => true, + _ => false, + }; + private string TypeName => Type switch { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 48f610caf303ba..1f4a05afa47c12 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -1659,12 +1659,6 @@ private char ScanControl() throw MakeException(RegexParseError.UnrecognizedControlCharacter, SR.UnrecognizedControlCharacter); } - /// Returns true for options allowed only at the top level - private bool IsOnlyTopOption(RegexOptions options) => - options == RegexOptions.RightToLeft || - options == RegexOptions.CultureInvariant || - options == RegexOptions.ECMAScript; - /// Scans cimsx-cimsx option string, stops at the first unrecognized char. 
private void ScanOptions() { @@ -1683,7 +1677,7 @@ private void ScanOptions() else { RegexOptions options = OptionFromCode(ch); - if (options == 0 || IsOnlyTopOption(options)) + if (options == 0) { return; } @@ -1804,7 +1798,6 @@ private static RegexOptions OptionFromCode(char ch) return ch switch { 'i' => RegexOptions.IgnoreCase, - 'r' => RegexOptions.RightToLeft, 'm' => RegexOptions.Multiline, 'n' => RegexOptions.ExplicitCapture, 's' => RegexOptions.Singleline, @@ -1812,7 +1805,6 @@ private static RegexOptions OptionFromCode(char ch) #if DEBUG 'd' => RegexOptions.Debug, #endif - 'e' => RegexOptions.ECMAScript, _ => 0, }; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs index 7b88e04f29c969..d183a7eea6e453 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/RegexNodeToSymbolicConverter.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Globalization; using System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions.Symbolic { @@ -201,11 +202,9 @@ BDD MapCategoryCodeToCondition(int code) => public SymbolicRegexNode Convert(RegexNode node, bool topLevel) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - RegexNode localNode = node; - bool localTopLevel = topLevel; - return StackHelper.CallOnEmptyStack(() => Convert(localNode, localTopLevel)); + return StackHelper.CallOnEmptyStack(Convert, node, topLevel); } switch (node.Type) diff --git 
a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs deleted file mode 100644 index 254c0d5e28dfff..00000000000000 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/StackHelper.cs +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Threading; -using System.Threading.Tasks; - -namespace System.Text.RegularExpressions.Symbolic -{ - /// Provides tools for avoiding stack overflows. - internal static class StackHelper - { - // Queues the supplied delegate to the thread pool, then block waiting for it to complete. - // It does so in a way that prevents task inlining (which would defeat the purpose) but that - // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies. - - /// Calls the provided function on the stack of a different thread pool thread. - /// The return type of the function. - /// The function to invoke. - public static T CallOnEmptyStack(Func func) => - Task.Run(func) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); - - /// Calls the provided action on the stack of a different thread pool thread. - /// The action to invoke. 
- public static void CallOnEmptyStack(Action action) => - Task.Run(action) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); - } -} diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs index fbd1cbda6ee753..3ae62fa919439a 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs @@ -5,6 +5,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; +using System.Threading; namespace System.Text.RegularExpressions.Symbolic { @@ -618,11 +619,10 @@ public SymbolicRegexNode Restrict(S pred) /// public int GetFixedLength() { - // Guard against stack overflow due to deep recursion. - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - SymbolicRegexNode thisRef = this; - return StackHelper.CallOnEmptyStack(() => thisRef.GetFixedLength()); + // If we can't recur further, assume no fixed length. 
+ return -1; } switch (_kind) @@ -690,11 +690,9 @@ public int GetFixedLength() internal SymbolicRegexNode MkDerivative(S elem, uint context) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - S localElem = elem; - uint localContext = context; - return StackHelper.CallOnEmptyStack(() => MkDerivative(localElem, localContext)); + return StackHelper.CallOnEmptyStack(MkDerivative, elem, context); } if (this == _builder._anyStar || this == _builder._nothing) @@ -1100,10 +1098,9 @@ public override string ToString() internal void ToString(StringBuilder sb) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - StringBuilder localSb = sb; - StackHelper.CallOnEmptyStack(() => ToString(localSb)); + StackHelper.CallOnEmptyStack(ToString, sb); return; } @@ -1665,12 +1662,9 @@ private S ComputeStartSet() internal SymbolicRegexNode PruneAnchors(uint prevKind, bool contWithWL, bool contWithNWL) { // Guard against stack overflow due to deep recursion - if (!RuntimeHelpers.TryEnsureSufficientExecutionStack()) + if (!StackHelper.TryEnsureSufficientExecutionStack()) { - uint localPrevKind = prevKind; - bool localContWithWL = contWithWL; - bool localContWithNWL = contWithNWL; - return StackHelper.CallOnEmptyStack(() => PruneAnchors(localPrevKind, localContWithWL, localContWithNWL)); + return StackHelper.CallOnEmptyStack(PruneAnchors, prevKind, contWithWL, contWithNWL); } if (!_info.StartsWithSomeAnchor) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs new file mode 100644 index 00000000000000..1ec05eb7d3d76d --- /dev/null +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -0,0 +1,82 
@@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Threading.Tasks; + +namespace System.Threading +{ + /// Provides tools for avoiding stack overflows. + internal static class StackHelper + { + /// Tries to ensure there is sufficient stack to execute the average .NET function. + public static bool TryEnsureSufficientExecutionStack() + { +#if REGEXGENERATOR + try + { + RuntimeHelpers.EnsureSufficientExecutionStack(); + return true; + } + catch + { + return false; + } +#else + return RuntimeHelpers.TryEnsureSufficientExecutionStack(); +#endif + } + + // Queues the supplied delegate to the thread pool, then block waiting for it to complete. + // It does so in a way that prevents task inlining (which would defeat the purpose) but that + // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies. + + /// Calls the provided action on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + public static void CallOnEmptyStack(Action action, TArg1 arg1) => + Task.Run(() => action(arg1)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided action on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The action to invoke. + /// The first argument to pass to the action. + /// The second argument to pass to the action. + /// The third argument to pass to the action. 
+ public static void CallOnEmptyStack(Action action, TArg1 arg1, TArg2 arg2, TArg3 arg3) => + Task.Run(() => action(arg1, arg2, arg3)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2) => + Task.Run(() => func(arg1, arg2)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + + /// Calls the provided function on the stack of a different thread pool thread. + /// The type of the first argument to pass to the function. + /// The type of the second argument to pass to the function. + /// The type of the third argument to pass to the function. + /// The return type of the function. + /// The function to invoke. + /// The first argument to pass to the function. + /// The second argument to pass to the function. + /// The third argument to pass to the function. 
+ public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3) => + Task.Run(() => func(arg1, arg2, arg3)) + .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) + .GetAwaiter().GetResult(); + } +} diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs index 129b2c4bc217f9..6dd56fc109e9b3 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Match.Tests.cs @@ -91,6 +91,10 @@ public static IEnumerable Match_MemberData() // for it to be a success. For a correct match, remove the last character, '3' from the pattern yield return ("[^0-9]+(?>[0-9]+)3", "abc123", RegexOptions.None, 0, 6, false, string.Empty); yield return ("[^0-9]+(?>[0-9]+)", "abc123", RegexOptions.None, 0, 6, true, "abc123"); + + yield return (@"(?!.*a)\w*g", "bcaefg", RegexOptions.None, 0, 6, true, "efg"); + yield return (@"(?!.*a)\w*g", "aaaaag", RegexOptions.None, 0, 6, true, "g"); + yield return (@"(?!.*a)\w*g", "aaaaaa", RegexOptions.None, 0, 6, false, string.Empty); } // More nonbacktracking expressions @@ -175,6 +179,12 @@ public static IEnumerable Match_MemberData() yield return (@".*", "abc", lineOption, 2, 1, true, "c"); } + // Nested loops + if (!RegexHelpers.IsNonBacktracking(engine)) + { + yield return ("a*(?:a[ab]*)*", "aaaababbbbbbabababababaaabbb", RegexOptions.None, 0, 28, true, "aaaa"); + } + // Using beginning/end of string chars \A, \Z: Actual - "\\Aaaa\\w+zzz\\Z" yield return (@"\Aaaa\w+zzz\Z", "aaaasdfajsdlfjzzz", RegexOptions.IgnoreCase, 0, 17, true, "aaaasdfajsdlfjzzz"); yield return (@"\Aaaaaa\w+zzz\Z", "aaaa", RegexOptions.IgnoreCase, 0, 4, false, string.Empty); @@ -344,7 +354,13 @@ public static IEnumerable Match_MemberData() yield return ("(?(cat)dog1|dog2)", "catdog1", 
RegexOptions.None, 0, 7, false, string.Empty); yield return ("(?(cat)dog1|dog2)", "catdog2", RegexOptions.None, 0, 7, true, "dog2"); yield return ("(?(cat)dog1|dog2)", "catdog1dog2", RegexOptions.None, 0, 11, true, "dog2"); + yield return (@"(\w+|\d+)a+[ab]+", "123123aa", RegexOptions.None, 0, 8, true, "123123aa"); + yield return ("(a|ab|abc|abcd)d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); + yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.None, 0, 4, false, string.Empty); + yield return ("(?>(?:a|ab|abc|abcd))d", "abcd", RegexOptions.RightToLeft, 0, 4, true, "abcd"); } + yield return ("[^a-z0-9]etag|[^a-z0-9]digest", "this string has .digest as a substring", RegexOptions.None, 16, 7, true, ".digest"); + yield return (@"a\w*a|def", "aaaaa", RegexOptions.None, 0, 5, true, "aaaaa"); // No Negation yield return ("[abcd-[abcd]]+", "abcxyzABCXYZ`!@#$%^&*()_-+= \t\n", RegexOptions.None, 0, 30, false, string.Empty); @@ -1584,7 +1600,7 @@ public static IEnumerable AllMatches_TestData() }; // Case insensitive cases by using ?i and some non-ASCII characters like Kelvin sign and applying ?i over negated character classes - yield return new object[] { engine, "(?i:[a-dÕ]+k*)", RegexOptions.None, "xyxaBõc\u212AKAyy", new (int, int, string)[] { (3, 6, "aBõc\u212AK"), (9, 1, "A") } }; + yield return new object[] { engine, "(?i:[a-d\u00D5]+k*)", RegexOptions.None, "xyxaB\u00F5c\u212AKAyy", new (int, int, string)[] { (3, 6, "aB\u00F5c\u212AK"), (9, 1, "A") } }; yield return new object[] { engine, "(?i:[a-d]+)", RegexOptions.None, "xyxaBcyy", new (int, int, string)[] { (3, 3, "aBc") } }; yield return new object[] { engine, "(?i:[\0-@B-\uFFFF]+)", RegexOptions.None, "xaAaAy", new (int, int, string)[] { (0, 6, "xaAaAy") } }; // this is the same as .+ yield return new object[] { engine, "(?i:[\0-ac-\uFFFF])", RegexOptions.None, "b", new (int, int, string)[] { (0, 1, "b") } }; diff --git 
a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs index 3b6bfcc0afe8f3..47ece73defcc76 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs @@ -61,33 +61,29 @@ internal static async Task SourceGenRegexAsync( var code = new StringBuilder(); code.AppendLine("using System.Text.RegularExpressions;"); + code.AppendLine("public partial class C {"); // Build up the code for all of the regexes int count = 0; foreach (var regex in regexes) { Assert.True(regex.options is not null || regex.matchTimeout is null); - string attr = $"[RegexGenerator({SymbolDisplay.FormatLiteral(regex.pattern, quote: true)}"; + code.Append($" [RegexGenerator({SymbolDisplay.FormatLiteral(regex.pattern, quote: true)}"); if (regex.options is not null) { - attr += $", {string.Join(" | ", regex.options.ToString().Split(',').Select(o => $"RegexOptions.{o.Trim()}"))}"; + code.Append($", {string.Join(" | ", regex.options.ToString().Split(',').Select(o => $"RegexOptions.{o.Trim()}"))}"); if (regex.matchTimeout is not null) { - attr += string.Create(CultureInfo.InvariantCulture, $", {(int)regex.matchTimeout.Value.TotalMilliseconds}"); + code.Append(string.Create(CultureInfo.InvariantCulture, $", {(int)regex.matchTimeout.Value.TotalMilliseconds}")); } } - attr += ")]"; - - // Create the source boilerplate for the pattern - code.AppendLine($@"public partial class C - {{ - {attr} - public static partial Regex Get{count}(); - }}"); + code.AppendLine($")] public static partial Regex Get{count}();"); count++; } + code.AppendLine("}"); + // Use a cached compilation to save a little time. Rather than creating an entirely new workspace // for each test, just create a single compilation, cache it, and then replace its syntax tree // on each test. 
diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index 49876011b281c1..ad5ca8d0754d98 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -500,15 +500,26 @@ public void PatternsReduceDifferently(string pattern1, string pattern2) [InlineData(@"abcd(?