Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -936,15 +936,29 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
}

int operand0 = Operand(0);
string set = _code.Strings[operand0];
ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];

while (c-- > 0)
if (!_rightToLeft &&
_code.StringsSetSearchValues[operand0] is RegexInterpreterCode.SetSearchValues setSearchValues)
{
if (!RegexCharClass.CharInClass(Forwardcharnext(inputSpan), set, ref setLookup))
if (!setSearchValues.AllMatch(inputSpan.Slice(runtextpos, c)))
{
goto BreakBackward;
}

runtextpos += c;
}
else
{
string set = _code.Strings[operand0];
ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];

while (c-- > 0)
{
if (!RegexCharClass.CharInClass(Forwardcharnext(inputSpan), set, ref setLookup))
{
goto BreakBackward;
}
}
}
}
advance = 2;
Expand Down Expand Up @@ -1022,16 +1036,35 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
{
int len = Math.Min(Operand(1), Forwardchars());
int operand0 = Operand(0);
string set = _code.Strings[operand0];
ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];
int i;

for (i = len; i > 0; i--)
if (!_rightToLeft &&
_code.StringsSetSearchValues[operand0] is RegexInterpreterCode.SetSearchValues setSearchValues)
{
if (!RegexCharClass.CharInClass(Forwardcharnext(inputSpan), set, ref setLookup))
int idx = setSearchValues.IndexOfAnyNonMatch(inputSpan.Slice(runtextpos, len));
if (idx == -1)
{
Backwardnext();
break;
runtextpos += len;
i = 0;
}
else
{
runtextpos += idx;
i = len - idx;
}
}
else
{
string set = _code.Strings[operand0];
ref uint[]? setLookup = ref _code.StringsAsciiLookup[operand0];

for (i = len; i > 0; i--)
{
if (!RegexCharClass.CharInClass(Forwardcharnext(inputSpan), set, ref setLookup))
{
Backwardnext();
break;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;

Expand All @@ -19,9 +20,61 @@ internal sealed class RegexInterpreterCode(RegexFindOptimizations findOptimizati
public readonly string[] Strings = strings;
/// <summary>ASCII lookup table optimization for sets in <see cref="Strings"/>.</summary>
public readonly uint[]?[] StringsAsciiLookup = new uint[strings.Length][];
/// <summary>Precomputed set matchers for character class strings in <see cref="Strings"/>, enabling vectorized scanning for Set opcodes.</summary>
public readonly SetSearchValues?[] StringsSetSearchValues = CreateSetSearchValues(strings);
/// <summary>How many instructions in <see cref="Codes"/> use backtracking.</summary>
public readonly int TrackCount = trackcount;

/// <summary>
/// Tries to create a <see cref="SetSearchValues"/> for each entry of <paramref name="strings"/> that is a
/// well-formed character class string whose characters <see cref="RegexCharClass.GetSetChars"/> can enumerate.
/// </summary>
/// <param name="strings">The interpreter's string table, which mixes character class strings and Multi literal strings.</param>
/// <returns>
/// An array with the same length as <paramref name="strings"/>. An entry is <see langword="null"/> when the
/// corresponding string fails the encoding check below or when <see cref="RegexCharClass.GetSetChars"/> reports no characters.
/// </returns>
private static SetSearchValues?[] CreateSetSearchValues(string[] strings)
{
    var result = new SetSearchValues?[strings.Length];

    // Scratch buffer shared across iterations; GetSetChars writes the enumerated set characters into it.
    Span<char> chars = stackalloc char[128];

    for (int i = 0; i < strings.Length; i++)
    {
        string set = strings[i];

        // The Strings table contains both character class strings and Multi literal strings.
        // Character class strings have a flags char of 0 (not negated) or 1 (negated) at index 0,
        // followed by set length and category length. Validate the encoding before calling GetSetChars,
        // which assumes a well-formed char-class string and could otherwise throw on arbitrary input.
        // A Multi literal that happens to pass this check only produces an entry here; the checks
        // themselves never index past the end of the string because the length test comes first.
        if (set.Length >= RegexCharClass.SetStartIndex &&
            set[RegexCharClass.FlagsIndex] is '\0' or '\u0001' &&
            RegexCharClass.SetStartIndex + set[RegexCharClass.SetLengthIndex] + set[RegexCharClass.CategoryLengthIndex] <= set.Length)
        {
            // GetSetChars returns the characters that back the set. For a negated set, these are
            // the characters excluded from the class; the separate IsNegated flag indicates how
            // to interpret them. If the set uses Unicode categories or has too many chars, it returns 0.
            int count = RegexCharClass.GetSetChars(set, chars);
            if (count > 0)
            {
                result[i] = new SetSearchValues(
                    SearchValues.Create(chars.Slice(0, count)),
                    RegexCharClass.IsNegated(set));
            }
        }
    }

    return result;
}

/// <summary>
/// Pairs a <see cref="SearchValues{Char}"/> with the negation flag of the set it was built from, so callers
/// can ask whether text belongs to the character class without caring how the class was declared.
/// </summary>
internal readonly struct SetSearchValues(SearchValues<char> values, bool negated)
{
    /// <summary>The characters backing the set.</summary>
    private readonly SearchValues<char> _setChars = values;

    /// <summary>Whether <see cref="_setChars"/> lists the characters excluded from the class rather than included in it.</summary>
    private readonly bool _isNegatedSet = negated;

    /// <summary>Determines whether every character of <paramref name="span"/> belongs to the character class.</summary>
    /// <param name="span">The text to test.</param>
    /// <returns><see langword="true"/> when no character of <paramref name="span"/> falls outside the class.</returns>
    public bool AllMatch(ReadOnlySpan<char> span)
    {
        if (_isNegatedSet)
        {
            // Negated class: the span matches only when none of the listed characters occur in it.
            return !span.ContainsAny(_setChars);
        }

        // Regular class: the span matches only when nothing other than the listed characters occurs in it.
        return !span.ContainsAnyExcept(_setChars);
    }

    /// <summary>Finds the first character of <paramref name="span"/> that does not belong to the character class.</summary>
    /// <param name="span">The text to scan.</param>
    /// <returns>The index of the first non-matching character, or -1 when every character matches.</returns>
    public int IndexOfAnyNonMatch(ReadOnlySpan<char> span)
    {
        if (_isNegatedSet)
        {
            // Negated class: a listed character is the first one that fails to match.
            return span.IndexOfAny(_setChars);
        }

        // Regular class: anything that is not a listed character fails to match.
        return span.IndexOfAnyExcept(_setChars);
    }
}

/// <summary>Gets whether the specified opcode may incur backtracking.</summary>
public static bool OpcodeBacktracks(RegexOpcode opcode)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3231,5 +3231,20 @@ public static IEnumerable<object[]> BalancingGroup_Various_MemberData()
}
}
}

[Fact]
public void Match_MultiLiteralResemblingCharClassEncoding()
{
    // Regression guard for Multi literals that look like character class strings.
    // The literal below yields a Multi opcode whose string is laid out as:
    //   [0] = '\0'      -> reads like the "not negated" flag
    //   [1] = '\u0002'  -> reads like an even "set length"
    //   [2] = '\0'      -> reads like "no categories"
    //   [3] = 'X'       -> and there is no [4]
    // Enumerating ranges in GetSetChars over such a string would index past its end and
    // throw IndexOutOfRangeException, so CreateSetSearchValues has to validate the
    // char-class encoding before it calls GetSetChars.
    string literal = "\x00\x02\x00X";

    bool matched = new Regex(literal).IsMatch(literal);

    Assert.True(matched);
}
}
}
Loading