@@ -36,7 +36,6 @@ public partial class RegexGenerator
3636 "#nullable enable" ,
3737 "#pragma warning disable CS0162 // Unreachable code" ,
3838 "#pragma warning disable CS0164 // Unreferenced label" ,
39- "#pragma warning disable CS0168 // Variable declared but never used" ,
4039 "#pragma warning disable CS0219 // Variable assigned but never used" ,
4140 "" ,
4241 } ;
@@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
274273 bool hasTextInfo = false ;
275274
276275 // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
277- // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
278- // delete it if no additional declarations are required, or we replace it with the list of additional declarations
279- // built up while generating code.
276+ // To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
277+ // and then insert them at that position once everything else has been output.
280278 var additionalDeclarations = new HashSet < string > ( ) ;
281279
282280 // Emit locals initialization
283- writer . WriteLine ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
284281 writer . WriteLine ( "int pos = base.runtextpos, end = base.runtextend;" ) ;
285282 writer . Flush ( ) ;
286283 int additionalDeclarationsPosition = ( ( StringWriter ) writer . InnerWriter ) . GetStringBuilder ( ) . Length ;
@@ -315,15 +312,17 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
315312 {
316313 case FindNextStartingPositionMode . LeadingPrefix_LeftToRight_CaseSensitive :
317314 Debug . Assert ( ! string . IsNullOrEmpty ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ) ;
318- EmitIndexOf_LeftToRight ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ;
315+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
316+ EmitIndexOf ( code . FindOptimizations . LeadingCaseSensitivePrefix ) ;
319317 break ;
320318
321319 case FindNextStartingPositionMode . FixedSets_LeftToRight_CaseSensitive :
322320 case FindNextStartingPositionMode . FixedSets_LeftToRight_CaseInsensitive :
323321 case FindNextStartingPositionMode . LeadingSet_LeftToRight_CaseSensitive :
324322 case FindNextStartingPositionMode . LeadingSet_LeftToRight_CaseInsensitive :
325323 Debug . Assert ( code . FindOptimizations . FixedDistanceSets is { Count : > 0 } ) ;
326- EmitFixedSet_LeftToRight ( ) ;
324+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
325+ EmitFixedSet ( ) ;
327326 break ;
328327
329328 default :
@@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
338337 }
339338 writer . WriteLine ( ) ;
340339
341- writer . WriteLine ( "// No match " ) ;
340+ writer . WriteLine ( "// No starting position found " ) ;
342341 writer . WriteLine ( "ReturnFalse:" ) ;
343342 writer . WriteLine ( "base.runtextpos = end;" ) ;
344343 writer . WriteLine ( "return false;" ) ;
@@ -368,8 +367,7 @@ bool EmitAnchors()
368367
369368 case RegexPrefixAnalyzer . Start :
370369 writer . WriteLine ( "// Start \\ G anchor" ) ;
371- additionalDeclarations . Add ( "int start = base.runtextstart;" ) ;
372- using ( EmitBlock ( writer , "if (pos > start)" ) )
370+ using ( EmitBlock ( writer , "if (pos > base.runtextstart)" ) )
373371 {
374372 writer . WriteLine ( "goto ReturnFalse;" ) ;
375373 }
@@ -400,6 +398,7 @@ bool EmitAnchors()
400398 // the other anchors, which all skip all subsequent processing if found, with BOL we just use it
401399 // to boost our position to the next line, and then continue normally with any searches.
402400 writer . WriteLine ( "// Beginning-of-line anchor" ) ;
401+ additionalDeclarations . Add ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
403402 additionalDeclarations . Add ( "int beginning = base.runtextbeg;" ) ;
404403 using ( EmitBlock ( writer , "if (pos > beginning && inputSpan[pos - 1] != '\\ n')" ) )
405404 {
@@ -418,8 +417,8 @@ bool EmitAnchors()
418417 return false ;
419418 }
420419
421- // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
422- void EmitIndexOf_LeftToRight ( string prefix )
420+ // Emits a case-sensitive prefix search for a string at the beginning of the pattern.
421+ void EmitIndexOf ( string prefix )
423422 {
424423 writer . WriteLine ( $ "int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), { Literal ( prefix ) } );") ;
425424 writer . WriteLine ( "if (i >= 0)" ) ;
@@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix)
429428 writer . WriteLine ( "}" ) ;
430429 }
431430
432- // Emits a left-to-right search for a set at a fixed position from the start of the pattern,
431+ // Emits a search for a set at a fixed position from the start of the pattern,
433432 // and potentially other sets at other fixed positions in the pattern.
434- void EmitFixedSet_LeftToRight ( )
433+ void EmitFixedSet ( )
435434 {
436435 List < ( char [ ] ? Chars , string Set , int Distance , bool CaseInsensitive ) > ? sets = code . FindOptimizations . FixedDistanceSets ;
437436 ( char [ ] ? Chars , string Set , int Distance , bool CaseInsensitive ) primarySet = sets ! [ 0 ] ;
@@ -600,15 +599,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
600599
601600 RegexOptions options = ( RegexOptions ) rm . Options ;
602601 RegexCode code = rm . Code ;
603- bool hasTimeout = false ;
604602
605603 // Helper to define names. Names start unadorned, but as soon as there's repetition,
606604 // they begin to have a numbered suffix.
607605 var usedNames = new Dictionary < string , int > ( ) ;
608606
609607 // Every RegexTree is rooted in the implicit Capture for the whole expression.
610608 // Skip the Capture node. We handle the implicit root capture specially.
611- RegexNode node = rm . Code . Tree . Root ;
609+ RegexNode node = code . Tree . Root ;
612610 Debug . Assert ( node . Type == RegexNode . Capture , "Every generated tree should begin with a capture node" ) ;
613611 Debug . Assert ( node . ChildCount ( ) == 1 , "Capture nodes should have one child" ) ;
614612 node = node . Child ( 0 ) ;
@@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
635633 }
636634
637635 // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
638- // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
639- // delete it if no additional declarations are required, or we replace it with the list of additional declarations
640- // built up while generating code.
636+ // To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
637+ // and then insert them at that position once everything else has been output.
641638 var additionalDeclarations = new HashSet < string > ( ) ;
642639 var additionalLocalFunctions = new Dictionary < string , string [ ] > ( ) ;
643640
@@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
646643 writer . WriteLine ( "global::System.ReadOnlySpan<char> inputSpan = base.runtext;" ) ;
647644 writer . WriteLine ( "int pos = base.runtextpos, end = base.runtextend;" ) ;
648645 writer . WriteLine ( $ "int original_pos = pos;") ;
649- hasTimeout = EmitLoopTimeoutCounterIfNeeded ( writer , rm ) ;
646+ bool hasTimeout = EmitLoopTimeoutCounterIfNeeded ( writer , rm ) ;
647+ bool hasTextInfo = EmitInitializeCultureForGoIfNecessary ( writer , rm ) ;
650648 writer . Flush ( ) ;
651649 int additionalDeclarationsPosition = ( ( StringWriter ) writer . InnerWriter ) . GetStringBuilder ( ) . Length ;
652650 int additionalDeclarationsIndent = writer . Indent ;
653- writer . WriteLine ( ) ;
654-
655- // TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
656- bool hasTextInfo = EmitInitializeCultureForGoIfNecessary ( writer , rm ) ;
657651
658652 // The implementation tries to use const indexes into the span wherever possible, which we can do
659653 // for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
@@ -936,8 +930,7 @@ void EmitAllBranches()
936930
937931 // Save off pos. We'll need to reset this each time a branch fails.
938932 string startingPos = ReserveName ( "alternation_starting_pos" ) ;
939- additionalDeclarations . Add ( $ "int { startingPos } = 0;") ;
940- writer . WriteLine ( $ "{ startingPos } = pos;") ;
933+ writer . WriteLine ( $ "int { startingPos } = pos;") ;
941934 int startingSliceStaticPos = sliceStaticPos ;
942935
943936 // We need to be able to undo captures in two situations:
@@ -964,8 +957,7 @@ void EmitAllBranches()
964957 if ( expressionHasCaptures && ( ( node . Options & RegexNode . HasCapturesFlag ) != 0 || ! isAtomic ) )
965958 {
966959 startingCapturePos = ReserveName ( "alternation_starting_capturepos" ) ;
967- additionalDeclarations . Add ( $ "int { startingCapturePos } = 0;") ;
968- writer . WriteLine ( $ "{ startingCapturePos } = base.Crawlpos();") ;
960+ writer . WriteLine ( $ "int { startingCapturePos } = base.Crawlpos();") ;
969961 }
970962 writer . WriteLine ( ) ;
971963
@@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node)
12111203 // to backtrack to. So, we expose a single Backtrack label and track which branch was
12121204 // followed in this resumeAt local.
12131205 string resumeAt = ReserveName ( "conditionalbackreference_branch" ) ;
1214- additionalDeclarations . Add ( $ "int { resumeAt } = 0;") ;
1206+ writer . WriteLine ( $ "int { resumeAt } = 0;") ;
12151207
12161208 // While it would be nicely readable to use an if/else block, if the branches contain
12171209 // anything that triggers backtracking, labels will end up being defined, and if they're
@@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node)
13401332 {
13411333 startingCapturePos = ReserveName ( "conditionalexpression_starting_capturepos" ) ;
13421334 writer . WriteLine ( $ "int { startingCapturePos } = base.Crawlpos();") ;
1343- writer . WriteLine ( ) ;
1335+ }
1336+
1337+ string resumeAt = ReserveName ( "conditionalexpression_resumeAt" ) ;
1338+ if ( ! isAtomic )
1339+ {
1340+ writer . WriteLine ( $ "int { resumeAt } = 0;") ;
13441341 }
13451342
13461343 // Emit the conditional expression. We need to reroute any match failures to either the "no" branch
@@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node)
13531350 {
13541351 doneLabel = originalDoneLabel ;
13551352 }
1356-
13571353 string postConditionalDoneLabel = doneLabel ;
1358- string resumeAt = ReserveName ( "conditionalexpression_resumeAt" ) ;
1359- if ( ! isAtomic )
1360- {
1361- additionalDeclarations . Add ( $ "int { resumeAt } = 0;") ;
1362- }
13631354
13641355 // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
13651356 // Since the "yes" branch may have a different execution path than the "no" branch or the lack of
@@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node)
13701361 writer . WriteLine ( ) ;
13711362 TransferSliceStaticPosToPos ( ) ; // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
13721363 string postYesDoneLabel = doneLabel ;
1373- if ( ! isAtomic && postYesDoneLabel != originalDoneLabel )
1374- {
1375- writer . WriteLine ( $ "{ resumeAt } = 0;") ;
1376- }
13771364 if ( postYesDoneLabel != originalDoneLabel || noBranch is not null )
13781365 {
13791366 writer . WriteLine ( $ "goto { end } ;") ;
@@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)
14671454
14681455 TransferSliceStaticPosToPos ( ) ;
14691456 string startingPos = ReserveName ( "capture_starting_pos" ) ;
1470- additionalDeclarations . Add ( $ "int { startingPos } = 0;") ;
1471- writer . WriteLine ( $ "{ startingPos } = pos;") ;
1457+ writer . WriteLine ( $ "int { startingPos } = pos;") ;
14721458 writer . WriteLine ( ) ;
14731459
14741460 RegexNode child = node . Child ( 0 ) ;
@@ -1604,7 +1590,8 @@ node.Type is RegexNode.Atomic or // atomic nodes by definition don't give up any
16041590 RegexNode . Oneloopatomic or RegexNode . Notoneloopatomic or RegexNode . Setloopatomic or // same for atomic loops
16051591 RegexNode . One or RegexNode . Notone or RegexNode . Set or // individual characters don't backtrack
16061592 RegexNode . Multi or // multiple characters don't backtrack
1607- RegexNode . Beginning or RegexNode . Start or RegexNode . End or RegexNode . EndZ or RegexNode . Boundary or RegexNode . NonBoundary or RegexNode . ECMABoundary or RegexNode . NonECMABoundary or // anchors don't backtrack
1593+ RegexNode . Ref or // backreferences don't backtrack
1594+ RegexNode . Beginning or RegexNode . Bol or RegexNode . Start or RegexNode . End or RegexNode . EndZ or RegexNode . Eol or RegexNode . Boundary or RegexNode . NonBoundary or RegexNode . ECMABoundary or RegexNode . NonECMABoundary or // anchors don't backtrack
16081595 RegexNode . Nothing or RegexNode . Empty or RegexNode . UpdateBumpalong // empty/nothing don't do anything
16091596 // Fixed-size repeaters of single characters or atomic don't backtrack
16101597 || node . Type is RegexNode . Oneloop or RegexNode . Notoneloop or RegexNode . Setloop or RegexNode . Onelazy or RegexNode . Notonelazy or RegexNode . Setlazy && node . M == node . N
@@ -1965,26 +1952,30 @@ void EmitAnchors(RegexNode node)
19651952 break ;
19661953
19671954 case RegexNode . End :
1968- using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Length > { sliceStaticPos } )") )
1955+ using ( EmitBlock ( writer , $ "if ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } )") )
19691956 {
19701957 writer . WriteLine ( $ "goto { doneLabel } ;") ;
19711958 }
19721959 break ;
19731960
19741961 case RegexNode . EndZ :
1975- writer . WriteLine ( $ "if ({ sliceSpan } .Length - 1 > { sliceStaticPos } || ({ sliceSpan } .Length > { sliceStaticPos } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n'))") ;
1962+ writer . WriteLine ( $ "if ({ sliceSpan } .Length - 1 > { sliceStaticPos } || ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n'))") ;
19761963 using ( EmitBlock ( writer , null ) )
19771964 {
19781965 writer . WriteLine ( $ "goto { doneLabel } ;") ;
19791966 }
19801967 break ;
19811968
19821969 case RegexNode . Eol :
1983- using ( EmitBlock ( writer , $ "if ({ sliceSpan } .Length > { sliceStaticPos } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n')") )
1970+ using ( EmitBlock ( writer , $ "if ({ IsSliceLengthGreaterThanSliceStaticPos ( ) } && { sliceSpan } [{ sliceStaticPos } ] != '\\ n')") )
19841971 {
19851972 writer . WriteLine ( $ "goto { doneLabel } ;") ;
19861973 }
19871974 break ;
1975+
1976+ string IsSliceLengthGreaterThanSliceStaticPos ( ) => // Returns the C# condition text (used inside the emitted `if (...)` checks above) testing that the slice extends past sliceStaticPos.
1977+ sliceStaticPos == 0 ? $ "!{ sliceSpan } .IsEmpty" : // At static offset 0 the check is emitted as "!IsEmpty" rather than "Length > 0".
1978+ $ "{ sliceSpan } .Length > { sliceStaticPos } "; // Otherwise the explicit length comparison against the static offset is emitted.
19881979 }
19891980 }
19901981
@@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
22222213 maxIterations = $ "{ node . N - node . M } ";
22232214
22242215 iterationCount = ReserveName ( "lazyloop_iteration" ) ;
2225- additionalDeclarations . Add ( $ "int { iterationCount } = 0;") ;
2226- writer . WriteLine ( $ "{ iterationCount } = 0;") ;
2216+ writer . WriteLine ( $ "int { iterationCount } = 0;") ;
22272217 }
22282218
22292219 // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
@@ -2366,18 +2356,15 @@ void EmitLazy(RegexNode node)
23662356 string body = ReserveName ( "LazyLoopBody" ) ;
23672357 string endLoop = ReserveName ( "LazyLoopEnd" ) ;
23682358
2369- additionalDeclarations . Add ( $ "int { iterationCount } = 0, { startingPos } = 0, { sawEmpty } = 0;") ;
2370- writer . WriteLine ( $ "{ iterationCount } = 0;") ;
2371- writer . WriteLine ( $ "{ startingPos } = pos;") ;
2372- writer . WriteLine ( $ "{ sawEmpty } = 0;") ;
2373- writer . WriteLine ( ) ;
2359+ writer . WriteLine ( $ "int { iterationCount } = 0, { startingPos } = pos, { sawEmpty } = 0;") ;
23742360
23752361 // If the min count is 0, start out by jumping right to what's after the loop. Backtracking
23762362 // will then bring us back in to do further iterations.
23772363 if ( minIterations == 0 )
23782364 {
23792365 writer . WriteLine ( $ "goto { endLoop } ;") ;
23802366 }
2367+ writer . WriteLine ( ) ;
23812368
23822369 // Iteration body
23832370 MarkLabel ( body , emitSemicolon : false ) ;
@@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has
32793266 {
32803267 if ( declarations . Count != 0 )
32813268 {
3282- StringBuilder tmp = new StringBuilder ( ) . AppendLine ( ) ;
3269+ var tmp = new StringBuilder ( ) ;
32833270 foreach ( string decl in declarations . OrderBy ( s => s ) )
32843271 {
32853272 for ( int i = 0 ; i < indent ; i ++ )
0 commit comments