Skip to content

arm64: Optimize ARM64 compare mask ExtractMostSignificantBits consumers #129688

Open
jonathandavies-arm wants to merge 3 commits into
dotnet:main from
jonathandavies-arm:upstream/sve/intrinsic-ExtractMostSignificantBits
Open

arm64: Optimize ARM64 compare mask ExtractMostSignificantBits consumers #129688
jonathandavies-arm wants to merge 3 commits into
dotnet:main from
jonathandavies-arm:upstream/sve/intrinsic-ExtractMostSignificantBits

Conversation

@jonathandavies-arm

@jonathandavies-arm jonathandavies-arm commented Jun 22, 2026

Copy link
Copy Markdown
Contributor

See discussion at #121981 (comment)

SuperPMI ASM diffs: ExtractMostSignificantBits

Base JIT: artifacts/asmdiff/builds/59979e64/core_root/libclrjit.so
Diff JIT: artifacts/tests/coreclr/linux.arm64.Checked/Tests/Core_Root/libclrjit.so
MCH: /tmp/ExtractMostSignificantBits_final.mch
Base commit: 59979e6401a (origin/main)
Diff commit: e37c3840564

Short summary

Diffs are based on 39 contexts (0 MinOpts, 39 FullOpts).

Overall (-480 bytes)
Collection Base size (bytes) Diff size (bytes) PerfScore in Diffs
ExtractMostSignificantBits_final.mch 8,492 -480 -28.40%
FullOpts (-480 bytes)
Collection Base size (bytes) Diff size (bytes) PerfScore in Diffs
ExtractMostSignificantBits_final.mch 8,492 -480 -28.40%

SuperPMI summary

Diffs are based on 39 contexts (0 MinOpts, 39 FullOpts).

Overall (-480 bytes)
Collection Base size (bytes) Diff size (bytes) PerfScore in Diffs
ExtractMostSignificantBits_final.mch 8,492 -480 -28.40%
FullOpts (-480 bytes)
Collection Base size (bytes) Diff size (bytes) PerfScore in Diffs
ExtractMostSignificantBits_final.mch 8,492 -480 -28.40%
Example diffs
ExtractMostSignificantBits_final.mch
-44 (-52.38%) : 17.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
@@ -7,10 +7,9 @@
 ; No matching PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T02] (  3,  3   )  simd16  ->   d0         single-def <System.Runtime.Intrinsics.Vector128`1[byte]>
+;  V00 arg0         [V00,T01] (  3,  3   )  simd16  ->   d0         single-def <System.Runtime.Intrinsics.Vector128`1[byte]>
 ;  V01 arg1         [V01,T00] (  3,  3   )   ubyte  ->   x0         single-def
 ;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
-;  V03 rat0         [V03,T01] (  3,  6   )  simd16  ->  d16         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -18,33 +17,20 @@ G_M10966_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
 						;; size=8 bbWeight=1 PerfScore 1.50
-G_M10966_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
+G_M10966_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             uxtb    w0, w0
             dup     v16.16b, w0
             cmhs    v16.16b, v0.16b, v16.16b
-            movi    v17.16b, #0x80
-            and     v16.16b, v16.16b, v17.16b
-            ldr     q17, [@RWD00]
-            ushl    v16.16b, v16.16b, v17.16b
-            uxtl2   v17.8h, v16.16b
-            shl     v17.8h, v17.8h, #8
-            uaddw   v16.8h, v17.8h, v16.8b
-            addv    h16, v16.8h
-            umov    w0, v16.h[0]
-            movi    v16.8b, #0
-            ins     v16.s[0], w0
-            cnt     v16.8b, v16.8b
-            addv    b16, v16.8b
-            umov    w0, v16.s[0]
-						;; size=68 bbWeight=1 PerfScore 20.00
+            ushr    v16.16b, v16.16b, #7
+            addv    b16, v16.16b
+            umov    w0, v16.b[0]
+						;; size=24 bbWeight=1 PerfScore 7.50
 G_M10966_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	00FFFEFDFCFBFAF9h, 00FFFEFDFCFBFAF9h
 
-
-; Total bytes of code 84, prolog size 8, PerfScore 23.50, instruction count 21, allocated bytes for code 84 (MethodHash=3eafd529) for method TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
+; Total bytes of code 40, prolog size 8, PerfScore 11.00, instruction count 10, allocated bytes for code 40 (MethodHash=3eafd529) for method TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -55,7 +41,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 21 (0x00015) Actual length = 84 (0x000054)
+  Function Length   : 10 (0x0000a) Actual length = 40 (0x000028)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-32 (-47.06%) : 12.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
@@ -17,29 +17,19 @@ G_M46948_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
 						;; size=8 bbWeight=1 PerfScore 1.50
-G_M46948_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
+G_M46948_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             dup     v16.4s, w0
             cmgt    v16.4s, v16.4s, v0.4s
-            movi    v17.4s, #0x80,  LSL #24
-            and     v16.4s, v16.4s, v17.4s
-            ldr     q17, [@RWD00]
-            ushl    v16.4s, v16.4s, v17.4s
+            ushr    v16.4s, v16.4s, #31
             addv    s16, v16.4s
-            smov    x0, v16.s[0]
-            movi    v16.8b, #0
-            ins     v16.s[0], w0
-            cnt     v16.8b, v16.8b
-            addv    b16, v16.8b
             umov    w0, v16.s[0]
-						;; size=52 bbWeight=1 PerfScore 15.50
+						;; size=20 bbWeight=1 PerfScore 7.00
 G_M46948_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	FFFFFFE2FFFFFFE1h, FFFFFFE4FFFFFFE3h
 
-
-; Total bytes of code 68, prolog size 8, PerfScore 19.00, instruction count 17, allocated bytes for code 68 (MethodHash=d8b1489b) for method TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
+; Total bytes of code 36, prolog size 8, PerfScore 10.50, instruction count 9, allocated bytes for code 36 (MethodHash=d8b1489b) for method TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -50,7 +40,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 17 (0x00011) Actual length = 68 (0x000044)
+  Function Length   : 9 (0x00009) Actual length = 36 (0x000024)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-32 (-47.06%) : 27.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
@@ -18,29 +18,19 @@ G_M44223_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
 						;; size=8 bbWeight=1 PerfScore 1.50
-G_M44223_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
+G_M44223_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             dup     v16.2s, w0
             cmgt    v16.2s, v16.2s, v0.2s
-            movi    v17.2s, #0x80,  LSL #24
-            and     v16.2s, v16.2s, v17.2s
-            ldr     d17, [@RWD00]
-            ushl    v16.2s, v16.2s, v17.2s
+            ushr    v16.2s, v16.2s, #31
             addp    v16.2s, v16.2s, v16.2s
-            smov    x0, v16.s[0]
-            movi    v16.8b, #0
-            ins     v16.s[0], w0
-            cnt     v16.8b, v16.8b
-            addv    b16, v16.8b
             umov    w0, v16.s[0]
-						;; size=52 bbWeight=1 PerfScore 15.50
+						;; size=20 bbWeight=1 PerfScore 7.00
 G_M44223_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	FFFFFFE2FFFFFFE1h
 
-
-; Total bytes of code 68, prolog size 8, PerfScore 19.00, instruction count 17, allocated bytes for code 68 (MethodHash=6ccd5340) for method TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
+; Total bytes of code 36, prolog size 8, PerfScore 10.50, instruction count 9, allocated bytes for code 36 (MethodHash=6ccd5340) for method TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -51,7 +41,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 17 (0x00011) Actual length = 68 (0x000044)
+  Function Length   : 9 (0x00009) Actual length = 36 (0x000024)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-8 (-13.33%) : 33.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
@@ -21,23 +21,21 @@ G_M59363_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             uxtb    w0, w0
             dup     v16.8b, w0
             cmhs    v16.8b, v0.8b, v16.8b
-            movi    v17.8b, #0x80
-            and     v16.8b, v16.8b, v17.8b
             ldr     d17, [@RWD00]
-            ushl    v16.8b, v16.8b, v17.8b
-            addv    b16, v16.8b
+            movi    v18.8b, #0x21
+            bsl     v16.8b, v17.8b, v18.8b
+            uminv   b16, v16.8b
             umov    w0, v16.b[0]
-            rbit    w0, w0
-            clz     w0, w0
-						;; size=44 bbWeight=1 PerfScore 12.00
+            sub     w0, w0, #1
+						;; size=36 bbWeight=1 PerfScore 11.00
 G_M59363_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	00FFFEFDFCFBFAF9h
+RWD00  	dq	0807060504030201h
 
 
-; Total bytes of code 60, prolog size 8, PerfScore 15.50, instruction count 15, allocated bytes for code 60 (MethodHash=312b181c) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
+; Total bytes of code 52, prolog size 8, PerfScore 14.50, instruction count 13, allocated bytes for code 52 (MethodHash=312b181c) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -48,7 +46,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 15 (0x0000f) Actual length = 60 (0x00003c)
+  Function Length   : 13 (0x0000d) Actual length = 52 (0x000034)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-8 (-13.33%) : 23.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
@@ -21,23 +21,21 @@ G_M61416_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             uxth    w0, w0
             dup     v16.4h, w0
             cmhs    v16.4h, v0.4h, v16.4h
-            movi    v17.4h, #0x80,  LSL #8
-            and     v16.4h, v16.4h, v17.4h
             ldr     d17, [@RWD00]
-            ushl    v16.4h, v16.4h, v17.4h
-            addv    h16, v16.4h
+            movi    v18.4h, #0x21
+            bsl     v16.4h, v17.4h, v18.4h
+            uminv   h16, v16.4h
             umov    w0, v16.h[0]
-            rbit    w0, w0
-            clz     w0, w0
-						;; size=44 bbWeight=1 PerfScore 12.00
+            sub     w0, w0, #1
+						;; size=36 bbWeight=1 PerfScore 11.00
 G_M61416_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	FFF4FFF3FFF2FFF1h
+RWD00  	dq	0004000300020001h
 
 
-; Total bytes of code 60, prolog size 8, PerfScore 15.50, instruction count 15, allocated bytes for code 60 (MethodHash=ba031017) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
+; Total bytes of code 52, prolog size 8, PerfScore 14.50, instruction count 13, allocated bytes for code 52 (MethodHash=ba031017) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -48,7 +46,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 15 (0x0000f) Actual length = 60 (0x00003c)
+  Function Length   : 13 (0x0000d) Actual length = 52 (0x000034)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-8 (-13.33%) : 8.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
@@ -21,23 +21,21 @@ G_M44819_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             uxth    w0, w0
             dup     v16.8h, w0
             cmhs    v16.8h, v0.8h, v16.8h
-            movi    v17.8h, #0x80,  LSL #8
-            and     v16.8h, v16.8h, v17.8h
             ldr     q17, [@RWD00]
-            ushl    v16.8h, v16.8h, v17.8h
-            addv    h16, v16.8h
+            movi    v18.8h, #0x21
+            bsl     v16.8h, v17.8h, v18.8h
+            uminv   h16, v16.8h
             umov    w0, v16.h[0]
-            rbit    w0, w0
-            clz     w0, w0
-						;; size=44 bbWeight=1 PerfScore 12.00
+            sub     w0, w0, #1
+						;; size=36 bbWeight=1 PerfScore 11.00
 G_M44819_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	FFF4FFF3FFF2FFF1h, FFF8FFF7FFF6FFF5h
+RWD00  	dq	0004000300020001h, 0008000700060005h
 
 
-; Total bytes of code 60, prolog size 8, PerfScore 15.50, instruction count 15, allocated bytes for code 60 (MethodHash=016950ec) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
+; Total bytes of code 52, prolog size 8, PerfScore 14.50, instruction count 13, allocated bytes for code 52 (MethodHash=016950ec) for method TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -48,7 +46,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 15 (0x0000f) Actual length = 60 (0x00003c)
+  Function Length   : 13 (0x0000d) Actual length = 52 (0x000034)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
Details

Size improvements/regressions per collection

Collection Contexts with diffs Improvements Regressions Same size Improvements (bytes) Regressions (bytes)
ExtractMostSignificantBits_final.mch 24 24 0 0 -480 +0

PerfScore improvements/regressions per collection

Collection Contexts with diffs Improvements Regressions Same PerfScore Improvements (PerfScore) Regressions (PerfScore) PerfScore Overall in FullOpts
ExtractMostSignificantBits_final.mch 24 24 0 0 -28.40% 0.00% -18.5844%

Context information

Collection Diffed contexts MinOpts FullOpts Missed, base Missed, diff
ExtractMostSignificantBits_final.mch 39 0 39 0 (0.00%) 0 (0.00%)

jit-analyze output

ExtractMostSignificantBits_final.mch

Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 8492 (overridden on cmd)
Total bytes of diff: 8012 (overridden on cmd)
Total bytes of delta: -480 (-5.65 % of base)
    diff is an improvement.
    relative diff is an improvement.
Detail diffs


Top file improvements (bytes):
         -44 : 17.dasm (-52.38 % of base)
         -32 : 22.dasm (-44.44 % of base)
         -32 : 7.dasm (-44.44 % of base)
         -32 : 32.dasm (-44.44 % of base)
         -32 : 27.dasm (-47.06 % of base)
         -32 : 12.dasm (-47.06 % of base)
         -28 : 16.dasm (-38.89 % of base)
         -28 : 15.dasm (-38.89 % of base)
         -20 : 18.dasm (-27.78 % of base)
         -16 : 30.dasm (-26.67 % of base)
         -16 : 6.dasm (-26.67 % of base)
         -16 : 5.dasm (-26.67 % of base)
         -16 : 10.dasm (-28.57 % of base)
         -16 : 26.dasm (-28.57 % of base)
         -16 : 25.dasm (-28.57 % of base)
         -16 : 21.dasm (-26.67 % of base)
         -16 : 11.dasm (-28.57 % of base)
         -16 : 31.dasm (-26.67 % of base)
         -16 : 20.dasm (-26.67 % of base)
          -8 : 33.dasm (-13.33 % of base)

24 total files with Code Size differences (24 improved, 0 regressed), 0 unchanged.

Top method improvements (bytes):
         -44 (-52.38 % of base) : 17.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 32.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 7.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
         -32 (-44.44 % of base) : 22.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
         -32 (-47.06 % of base) : 12.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
         -32 (-47.06 % of base) : 27.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
         -28 (-38.89 % of base) : 15.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -28 (-38.89 % of base) : 16.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -20 (-27.78 % of base) : 18.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -16 (-26.67 % of base) : 30.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 10.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 25.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-26.67 % of base) : 5.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 20.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 31.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 11.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 26.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-26.67 % of base) : 6.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 21.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
          -8 (-13.33 % of base) : 33.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)

Top method improvements (percentages):
         -44 (-52.38 % of base) : 17.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -32 (-47.06 % of base) : 12.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
         -32 (-47.06 % of base) : 27.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
         -32 (-44.44 % of base) : 32.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 7.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
         -32 (-44.44 % of base) : 22.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
         -28 (-38.89 % of base) : 15.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -28 (-38.89 % of base) : 16.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 10.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 25.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 11.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 26.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -20 (-27.78 % of base) : 18.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -16 (-26.67 % of base) : 30.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-26.67 % of base) : 5.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 20.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 31.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-26.67 % of base) : 6.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 21.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
          -8 (-14.29 % of base) : 13.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)

24 total methods with Code Size differences (24 improved, 0 regressed).


Jit-analyze summary


Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 8492 (overridden on cmd)
Total bytes of diff: 8012 (overridden on cmd)
Total bytes of delta: -480 (-5.65 % of base)
    diff is an improvement.
    relative diff is an improvement.
Detail diffs


Top file improvements (bytes):
         -44 : 17.dasm (-52.38 % of base)
         -32 : 22.dasm (-44.44 % of base)
         -32 : 7.dasm (-44.44 % of base)
         -32 : 32.dasm (-44.44 % of base)
         -32 : 27.dasm (-47.06 % of base)
         -32 : 12.dasm (-47.06 % of base)
         -28 : 16.dasm (-38.89 % of base)
         -28 : 15.dasm (-38.89 % of base)
         -20 : 18.dasm (-27.78 % of base)
         -16 : 30.dasm (-26.67 % of base)
         -16 : 6.dasm (-26.67 % of base)
         -16 : 5.dasm (-26.67 % of base)
         -16 : 10.dasm (-28.57 % of base)
         -16 : 26.dasm (-28.57 % of base)
         -16 : 25.dasm (-28.57 % of base)
         -16 : 21.dasm (-26.67 % of base)
         -16 : 11.dasm (-28.57 % of base)
         -16 : 31.dasm (-26.67 % of base)
         -16 : 20.dasm (-26.67 % of base)
          -8 : 33.dasm (-13.33 % of base)

24 total files with Code Size differences (24 improved, 0 regressed), 0 unchanged.

Top method improvements (bytes):
         -44 (-52.38 % of base) : 17.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 32.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 7.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
         -32 (-44.44 % of base) : 22.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
         -32 (-47.06 % of base) : 12.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
         -32 (-47.06 % of base) : 27.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
         -28 (-38.89 % of base) : 15.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -28 (-38.89 % of base) : 16.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -20 (-27.78 % of base) : 18.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -16 (-26.67 % of base) : 30.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 10.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 25.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-26.67 % of base) : 5.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 20.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 31.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 11.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 26.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-26.67 % of base) : 6.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 21.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
          -8 (-13.33 % of base) : 33.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)

Top method improvements (percentages):
         -44 (-52.38 % of base) : 17.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -32 (-47.06 % of base) : 12.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)
         -32 (-47.06 % of base) : 27.dasm - TestExtractMostSignificantBits.Program:CountLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):int (FullOpts)
         -32 (-44.44 % of base) : 32.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):int (FullOpts)
         -32 (-44.44 % of base) : 7.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):int (FullOpts)
         -32 (-44.44 % of base) : 22.dasm - TestExtractMostSignificantBits.Program:CountGreaterThanOrEqualUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):int (FullOpts)
         -28 (-38.89 % of base) : 15.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -28 (-38.89 % of base) : 16.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):bool (FullOpts)
         -16 (-28.57 % of base) : 10.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 25.dasm - TestExtractMostSignificantBits.Program:AnyLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 11.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):bool (FullOpts)
         -16 (-28.57 % of base) : 26.dasm - TestExtractMostSignificantBits.Program:NoneLessThanInt3264(System.Runtime.Intrinsics.Vector64`1[int],int):bool (FullOpts)
         -20 (-27.78 % of base) : 18.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstGreaterThanOrEqualByte(System.Runtime.Intrinsics.Vector128`1[byte],byte):int (FullOpts)
         -16 (-26.67 % of base) : 30.dasm - TestExtractMostSignificantBits.Program:AnyGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-26.67 % of base) : 5.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 20.dasm - TestExtractMostSignificantBits.Program:AnyLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 31.dasm - TestExtractMostSignificantBits.Program:NoneGreaterThanOrEqualByte64(System.Runtime.Intrinsics.Vector64`1[byte],byte):bool (FullOpts)
         -16 (-26.67 % of base) : 6.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt16(System.Runtime.Intrinsics.Vector128`1[ushort],ushort):bool (FullOpts)
         -16 (-26.67 % of base) : 21.dasm - TestExtractMostSignificantBits.Program:NoneLessThanUInt1664(System.Runtime.Intrinsics.Vector64`1[ushort],ushort):bool (FullOpts)
          -8 (-14.29 % of base) : 13.dasm - TestExtractMostSignificantBits.Program:IndexOfFirstLessThanInt32(System.Runtime.Intrinsics.Vector128`1[int],int):int (FullOpts)

24 total methods with Code Size differences (24 improved, 0 regressed).


Generated artifacts

  • SuperPMI log: artifacts/spmi/superpmi.27.log
  • Short summary: artifacts/spmi/diff_short_summary.11.md
  • Full summary: artifacts/spmi/diff_summary.11.md
  • Jit-analyze summary: artifacts/spmi/asm.ExtractMostSignificantBits_final/summary.md
  • Base asm: artifacts/spmi/asm.ExtractMostSignificantBits_final/base/
  • Diff asm: artifacts/spmi/asm.ExtractMostSignificantBits_final/diff/

@github-actions github-actions Bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jun 22, 2026
@dotnet-policy-service dotnet-policy-service Bot added the community-contribution Indicates that the PR has been added by a community member label Jun 22, 2026
@dotnet-policy-service

Copy link
Copy Markdown
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@EgorBo

EgorBo commented Jun 22, 2026

Copy link
Copy Markdown
Member

Isn't it the same as #126790 ?
The difference is that doing everything in the Rationalizer, as this PR does, will not allow us to see that the input is 0/all-bits-set unless it's directly propagated as a tree.

@a74nh

a74nh commented Jun 24, 2026

Copy link
Copy Markdown
Contributor

Isn't it the same as #126790 ? The difference is that doing everything in Rationalizer like this PR does will not allow to see that the input is 0/allbits sets unless it's directly propagated as a tree.

What's the status of #126790? It hasn't moved since April. Is it planned to get in for .NET 11?

@EgorBo

EgorBo commented Jun 24, 2026

Copy link
Copy Markdown
Member

Isn't it the same as #126790 ? The difference is that doing everything in Rationalizer like this PR does will not allow to see that the input is 0/allbits sets unless it's directly propagated as a tree.

What's the status on #126790 ? It hasn't moved since April. Is it planned to get in for NET11 ?

@tannergooding optimized all Vector*.IndexOf* with SHRN on the C# side in this PR: #126678, so my PR basically became useless; see the benchmark: #126790 (comment)

We still have a few places where we call Compare + EMSB instead of the explicit IndexOf APIs; that is what my other PR tries to address: #126841

There will be a few places where we won't be able to replace EMSB with IndexOf (e.g. iterators over multiple matches), though. But I don't think this PR does a good job for them either, judging by the diffs -
{B481468C-6B1A-4F11-A66B-E5A76DAF7210}

Basically, +600 LOC of JIT changes for 1 use-case (context) in benchmarks.

@EgorBo

EgorBo commented Jun 24, 2026

Copy link
Copy Markdown
Member

If you want to make this transformation more useful, you need to borrow the assertionprop.cpp changes from #126790 - AllComponentsEitherZeroOrAllBitsSet - so it can leave a GTF_* flag on a tree you handle in the rationalizer, because expecting the input to be a comparison tree is a very conservative impl (hence, small diffs).

Teach assertion propagation on ARM64 to recognize Vector64/128 ExtractMostSignificantBits inputs whose value numbers represent per-element boolean masks. The helper recognizes comparison masks, all-zero/all-bits constants, boolean-preserving operations, and reaching PHI values, and marks the EMSB node with a HW intrinsic flag.

Consume the flag in rationalization so existing ExtractMostSignificantBits rewrites can handle mask values that have flowed through locals, while keeping unsupported element types filtered out. Add coverage for a comparison mask stored in a local before PopCount(ExtractMostSignificantBits()).
@EgorBo

EgorBo commented Jun 25, 2026

Copy link
Copy Markdown
Member

A lot bigger diffs now.

case GT_LE:
case GT_LT:
{
return varTypeIsIntegral(baseType) && (genTypeSize(baseType) <= genTypeSize(intrinsicSimdBaseType));

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd leave a comment for <=, the original impl had it.


if (comp->vnStore->VNVisitReachingVNs(op1VN, vnVisitor) == ValueNumStore::VNVisit::Continue)
{
tree->gtFlags |= GTF_HW_ZERO_OR_ALL_BITS_SET;

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to fix GenTree::Compare - it should take this flag into account and return false when the flag differs, even if the rest is the same.

// optAssertionProp_HWIntrinsic: Propagate VN-derived facts to hwintrinsic tree flags.
//
// Arguments:
// tree - The hwintrinsic node

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `comp` arg is missing here.

// Return Value:
// True if the hwintrinsic produces a mask where each SIMD element is either all-bits-set or zero.
//
static bool IsHWIntrinsicCmpMask(NamedIntrinsic intrinsic)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be shared with assertionprop

// Return Value:
// The normalized SIMD base type, or TYP_UNDEF if it is unsupported.
//
static var_types NormalizeCmpMaskSimdBaseType(var_types simdBaseType)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @hez2010 added a similar helper recently

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. There's a getIndexTypeForShuffle. I think it can be renamed to a more general term if we are going to reuse it here.

node = *useEdge;
}
}
else if (node->AsIntrinsic()->gtIntrinsicName == NI_PRIMITIVE_TrailingZeroCount)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to handle LeadingZeroCount as well? (e.g. for a last-index-of pattern, I assume)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI community-contribution Indicates that the PR has been added by a community member

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants