diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index b57aaedda9e2c2..06b8b23105f99c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -893,10 +893,7 @@ private RegexNode ReduceLoops() // If the Loop or Lazyloop now only has one child node and its a Set, One, or Notone, // reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will - // generally have only produced the latter, but other reductions could have exposed - // this. We can also reduce or eliminate certain loops that are nops, e.g. - // a loop with a minimum of 0 that wraps a zero-width assertion is either asserting something - // or not, and is thus useless. + // generally have only produced the latter, but other reductions could have exposed this. if (u.ChildCount() == 1) { RegexNode child = u.Child(0); @@ -910,14 +907,27 @@ private RegexNode ReduceLoops() break; case RegexNodeKind.Empty: - case RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or + // A loop around an empty is itself empty, regardless of iteration counts. + u = child; + break; + + case RegexNodeKind.PositiveLookaround when ContainsKind(child, [RegexNodeKind.Capture]) is false: + case RegexNodeKind.NegativeLookaround or RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol or RegexNodeKind.Eol or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or - RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary - when u.M == 0: - u = new RegexNode(RegexNodeKind.Empty, Options); + RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary: + // A loop around (most) zero-width assertions can also be reduced. 
If it has a lower bound of 0, + // then it's either asserting something or not, and is thus useless and replaceable by empty. + // If it has a lower bound > 0, then the contents are still needed, but the loop isn't, since + // it's non-consuming and thus any more repetitions than 1 are redundant. The one zero-width assertion + // that can't be handled in this way is a PositiveLookaround, because it might contain capture groups + // with captures that must persist past the lookaround (in contrast, negative lookarounds undo all + // captures); if it were to be removed, it could affect both subsequent backreferences as well as access + // to capture information in the resulting Match. Thus, we can only transform a PositiveLookaround in + // this manner if it doesn't contain any captures. + u = u.M == 0 ? new RegexNode(RegexNodeKind.Empty, Options) : child; break; } } @@ -2058,7 +2068,7 @@ private RegexNode ReduceLookaround() // Captures inside of negative lookarounds are undone after the lookaround. Thus, if there's nothing // inside of the negative lookaround that needs that capture group (namely a backreference), we can // remove the capture. - if (Kind is RegexNodeKind.NegativeLookaround && ContainsBackreference(Child(0)) is false) + if (Kind is RegexNodeKind.NegativeLookaround && ContainsKind(Child(0), [RegexNodeKind.Backreference, RegexNodeKind.BackreferenceConditional]) is false) { if (RemoveCaptures(this, 0)) { @@ -2131,26 +2141,32 @@ RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol or RegexNodeKind.Eol or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or - RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary; + RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary or + RegexNodeKind.UpdateBumpalong; - /// Gets whether the node contains a backreference anywhere in its tree. - private static bool? 
ContainsBackreference(RegexNode node) + /// Gets whether the node contains any of the specified kinds anywhere in its tree. + /// Returns true if it does, false if it doesn't, and null if it can't be determined. + private static bool? ContainsKind(RegexNode node, ReadOnlySpan kinds) { - if (node.Kind is RegexNodeKind.Backreference or RegexNodeKind.BackreferenceConditional) + foreach (RegexNodeKind kind in kinds) { - return true; + if (node.Kind == kind) + { + return true; + } } if (!StackHelper.TryEnsureSufficientExecutionStack()) { - // If we can't recur further, just stop optimizing. + // If we can't recur further, just stop optimizing. We need to return null to signal + // that the result can't be trusted. return null; } int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { - if (ContainsBackreference(node.Child(i)) is true) + if (ContainsKind(node.Child(i), kinds) is true) { return true; } @@ -2787,25 +2803,10 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil // Skip over empty nodes, as they're pure nops. They would ideally have been optimized away, // but can still remain in some situations. } - else if (consumeZeroWidthNodes && - // anchors - child.Kind is RegexNodeKind.Beginning or - RegexNodeKind.Bol or - RegexNodeKind.Start or - // boundaries - RegexNodeKind.Boundary or - RegexNodeKind.ECMABoundary or - RegexNodeKind.NonBoundary or - RegexNodeKind.NonECMABoundary or - // lookarounds - RegexNodeKind.NegativeLookaround or - RegexNodeKind.PositiveLookaround or - // logic - RegexNodeKind.UpdateBumpalong) + else if (consumeZeroWidthNodes && IsZeroWidthAssertion(child.Kind)) { - // Skip over zero-width nodes that might be reasonable at the beginning of or within a substring. - // We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that - // may not fully represent the semantics of this portion of the pattern. + // Skip over zero-width nodes.
We can only do these if consumeZeroWidthNodes is true, as otherwise we'd + // be producing a string that may not fully represent the semantics of this portion of the pattern. } else { diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index b720936b59b995..914057be52d120 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -93,6 +93,9 @@ public static IEnumerable Match_MemberData() yield return (@"(?:(?!(b)b)\1a)*", "babababa", RegexOptions.None, 0, 8, true, string.Empty); yield return (@"(.*?)a(?!(a+)b\2c)\2(.*)", "baaabaac", RegexOptions.None, 0, 8, false, string.Empty); yield return (@"(?!(abc))+\w\w\w", "abcdef", RegexOptions.None, 0, 6, true, "bcd"); + yield return (@"(?=(abc))?\1", "abc", RegexOptions.None, 0, 3, true, "abc"); + yield return (@"(?=(abc))+\1", "abc", RegexOptions.None, 0, 3, true, "abc"); + yield return (@"(?=(abc))*\1", "abc", RegexOptions.None, 0, 3, true, "abc"); // Zero-width positive lookbehind assertion yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef"); diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs index 15d7855cbb2bcf..b6bff3399099d9 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs @@ -279,6 +279,15 @@ public class RegexReductionTests [InlineData("(?!(abc))", "(?!abc)")] [InlineData("(?!a(b*)c)", "(?!ab*c)")] [InlineData("(?!a((((b))))c)", "(?!abc)")] + [InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")] + [InlineData(@"(?=(?<=(?=abc)))", 
@"(?<=(?=abc))")] + [InlineData(@"(?=\G)abc", @"\Gabc")] + [InlineData(@"(?=^)abc", @"^abc")] + [InlineData(@"(?=\b)abc", @"\babc")] + [InlineData(@"abc(?=\z)", @"abc\z")] + [InlineData(@"abc(?=\Z)", @"abc\Z")] + [InlineData(@"abc(?=\A)", @"abc\A")] + [InlineData(@"abc(?=$)", @"abc$")] // Alternation reduction [InlineData("a|b", "[ab]")] [InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")] @@ -409,16 +418,6 @@ public class RegexReductionTests [InlineData(@"\z\z", @"\z")] [InlineData(@"\G\G", @"\G")] [InlineData(@"\A\A", @"\A")] - // Lookarounds - [InlineData(@"(?=^)abc", @"^abc")] - [InlineData(@"(?=\G)abc", @"\Gabc")] - [InlineData(@"abc(?=$)", @"abc$")] - [InlineData(@"(?=\b)abc", @"\babc")] - [InlineData(@"abc(?=\z)", @"abc\z")] - [InlineData(@"abc(?=\Z)", @"abc\Z")] - [InlineData(@"abc(?=\A)", @"abc\A")] - [InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")] - [InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")] // Nothing handling [InlineData(@"\wabc(?!)def", "(?!)")] [InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]