JIT: Support bitwise field extractions from parameter registers #112740

jakobbotsch · 2025-02-20T14:53:32Z

Recent work now allows us to finally add support for the backend to extract fields out of parameters without spilling them to stack. Previously this was only supported when the fields mapped cleanly to registers.

A win-x64 example:

static int Foo(int? foo)
{
    return foo.HasValue ? foo.Value : 0;
}

 ; Method Program:Foo(System.Nullable`1[int]):int (FullOpts)
 G_M19236_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=0.50 PerfScore 0.50
+						;; size=0 bbWeight=0.50 PerfScore 0.00

-G_M19236_IG02:  ;; offset=0x0005
-       movzx    rcx, cl
-       xor      eax, eax
-       test     ecx, ecx
-       cmovne   eax, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=0.50 PerfScore 1.38
+G_M19236_IG02:  ;; offset=0x0000
+       movzx    rax, cl
+       shr      rcx, 32
+       xor      edx, edx
+       test     eax, eax
+       mov      eax, edx
+       cmovne   eax, ecx
+						;; size=16 bbWeight=0.50 PerfScore 0.88

-G_M19236_IG03:  ;; offset=0x0011
+G_M19236_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=0.50 PerfScore 0.50
-; Total bytes of code: 18
-
+; Total bytes of code: 17

Another win-x64 example:

static float Sum(PointF p)
{
    return p.X + p.Y;
}

 ; Method Program:Sum(System.Drawing.PointF):float (FullOpts)
 G_M48891_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=1 PerfScore 1.00
+						;; size=0 bbWeight=1 PerfScore 0.00

-G_M48891_IG02:  ;; offset=0x0005
-       vmovss   xmm0, dword ptr [rsp+0x08]
-       vaddss   xmm0, xmm0, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=1 PerfScore 8.00
+G_M48891_IG02:  ;; offset=0x0000
+       vmovd    xmm0, ecx
+       shr      rcx, 32
+       vmovd    xmm1, ecx
+       vaddss   xmm0, xmm0, xmm1
+						;; size=16 bbWeight=1 PerfScore 7.50

-G_M48891_IG03:  ;; offset=0x0011
+G_M48891_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code: 18
+; Total bytes of code: 17

An arm64 example:

static bool Test(Memory<int> mem)
{
    return mem.Length > 10;
}

 ; Method Program:Test(System.Memory`1[int]):ubyte (FullOpts)
 G_M53448_IG01:  ;; offset=0x0000
-            stp     fp, lr, [sp, #-0x20]!
+            stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
-            stp     x0, x1, [fp, #0x10]	// [V00 arg0], [V00 arg0+0x08]
-						;; size=12 bbWeight=1 PerfScore 2.50
+						;; size=8 bbWeight=1 PerfScore 1.50

-G_M53448_IG02:  ;; offset=0x000C
-            ldr     w0, [fp, #0x1C]	// [V00 arg0+0x0c]
+G_M53448_IG02:  ;; offset=0x0008
+            lsr     x0, x1, #32
             cmp     w0, #10
             cset    x0, gt
-						;; size=12 bbWeight=1 PerfScore 3.00
+						;; size=12 bbWeight=1 PerfScore 2.00

-G_M53448_IG03:  ;; offset=0x0018
-            ldp     fp, lr, [sp], #0x20
+G_M53448_IG03:  ;; offset=0x0014
+            ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 32
+; Total bytes of code: 28

Float -> float extractions that do not map cleanly is still not supported, but should be doable (via vector register extractions). Float -> int extractions are not supported, but I'm not sure we see these.

This is often not a code size improvement, but typically a perfscore improvement. Also this seems to have some bad interactions with call arguments since they do not yet support something similar, but hopefully that can be improved separately.

This should fix a number of issues that I need to go find.

Recent work now allows us to finally add support for the backend to extract fields out of parameters without spilling them to stack. Previously this was only supported when the fields mapped cleanly to registers. A win-x64 example: ```csharp static int Foo(int? foo) { return foo.HasValue ? foo.Value : 0; } ``` ```diff ; Method Program:Foo(System.Nullable`1[int]):int (FullOpts) G_M19236_IG01: ;; offset=0x0000 - mov qword ptr [rsp+0x08], rcx - ;; size=5 bbWeight=0.50 PerfScore 0.50 + ;; size=0 bbWeight=0.50 PerfScore 0.00 -G_M19236_IG02: ;; offset=0x0005 - movzx rcx, cl - xor eax, eax - test ecx, ecx - cmovne eax, dword ptr [rsp+0x0C] - ;; size=12 bbWeight=0.50 PerfScore 1.38 +G_M19236_IG02: ;; offset=0x0000 + movzx rax, cl + shr rcx, 32 + xor edx, edx + test eax, eax + mov eax, edx + cmovne eax, ecx + ;; size=16 bbWeight=0.50 PerfScore 0.88 -G_M19236_IG03: ;; offset=0x0011 +G_M19236_IG03: ;; offset=0x0010 ret ;; size=1 bbWeight=0.50 PerfScore 0.50 -; Total bytes of code: 18 - +; Total bytes of code: 17 ``` Another win-x64 example: ```csharp static float Sum(PointF p) { return p.X + p.Y; } ``` ```diff ; Method Program:Sum(System.Drawing.PointF):float (FullOpts) G_M48891_IG01: ;; offset=0x0000 - mov qword ptr [rsp+0x08], rcx - ;; size=5 bbWeight=1 PerfScore 1.00 + ;; size=0 bbWeight=1 PerfScore 0.00 -G_M48891_IG02: ;; offset=0x0005 - vmovss xmm0, dword ptr [rsp+0x08] - vaddss xmm0, xmm0, dword ptr [rsp+0x0C] - ;; size=12 bbWeight=1 PerfScore 8.00 +G_M48891_IG02: ;; offset=0x0000 + vmovd xmm0, ecx + shr rcx, 32 + vmovd xmm1, ecx + vaddss xmm0, xmm0, xmm1 + ;; size=16 bbWeight=1 PerfScore 7.50 -G_M48891_IG03: ;; offset=0x0011 +G_M48891_IG03: ;; offset=0x0010 ret ;; size=1 bbWeight=1 PerfScore 1.00 -; Total bytes of code: 18 +; Total bytes of code: 17 ``` An arm64 example: ```csharp static bool Test(Memory<int> mem) { return mem.Length > 10; } ``` ```diff ; Method Program:Test(System.Memory`1[int]):ubyte (FullOpts) G_M53448_IG01: ;; offset=0x0000 - stp fp, lr, [sp, #-0x20]! + stp fp, lr, [sp, #-0x10]! mov fp, sp - stp x0, x1, [fp, #0x10] // [V00 arg0], [V00 arg0+0x08] - ;; size=12 bbWeight=1 PerfScore 2.50 + ;; size=8 bbWeight=1 PerfScore 1.50 -G_M53448_IG02: ;; offset=0x000C - ldr w0, [fp, #0x1C] // [V00 arg0+0x0c] +G_M53448_IG02: ;; offset=0x0008 + lsr x0, x1, dotnet#32 cmp w0, dotnet#10 cset x0, gt - ;; size=12 bbWeight=1 PerfScore 3.00 + ;; size=12 bbWeight=1 PerfScore 2.00 -G_M53448_IG03: ;; offset=0x0018 - ldp fp, lr, [sp], #0x20 +G_M53448_IG03: ;; offset=0x0014 + ldp fp, lr, [sp], #0x10 ret lr ;; size=8 bbWeight=1 PerfScore 2.00 -; Total bytes of code: 32 +; Total bytes of code: 28 ```

dotnet-policy-service · 2025-02-20T14:54:15Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

…ameters

jakobbotsch · 2025-02-24T10:26:56Z

/azp run runtime-coreclr superpmi-replay, runtime-coreclr superpmi-diffs

azure-pipelines · 2025-02-24T10:27:15Z

Azure Pipelines successfully started running 2 pipeline(s).

src/coreclr/jit/lower.cpp

…ameters

jakobbotsch · 2025-02-26T09:08:08Z

/azp run runtime-coreclr superpmi-diffs

azure-pipelines · 2025-02-26T09:08:24Z

Azure Pipelines successfully started running 1 pipeline(s).

Based on the new `FIELD_LIST` support for returns this PR adds support for the JIT to combine smaller fields via bitwise operations when returned, instead of spilling these to stack. win-x64 examples: ```csharp static int? Test() { return Environment.TickCount; } ``` ```diff call System.Environment:get_TickCount():int - mov dword ptr [rsp+0x24], eax - mov byte ptr [rsp+0x20], 1 - mov rax, qword ptr [rsp+0x20] - ;; size=19 bbWeight=1 PerfScore 4.00 + mov eax, eax + shl rax, 32 + or rax, 1 + ;; size=15 bbWeight=1 PerfScore 2.00 ``` (the `mov eax, eax` is unnecessary, but not that simple to get rid of) ```csharp static (int x, float y) Test(int x, float y) { return (x, y); } ``` ```diff - mov dword ptr [rsp], ecx - vmovss dword ptr [rsp+0x04], xmm1 - mov rax, qword ptr [rsp] + vmovd eax, xmm1 + shl rax, 32 + mov ecx, ecx + or rax, rcx ;; size=13 bbWeight=1 PerfScore 3.00 ``` An arm64 example: ```csharp static Memory<int> ToMemory(int[] arr) { return arr.AsMemory(); } ``` ```diff G_M45070_IG01: ;; offset=0x0000 - stp fp, lr, [sp, #-0x20]! + stp fp, lr, [sp, #-0x10]! mov fp, sp - str xzr, [fp, #0x10] // [V03 tmp2] - ;; size=12 bbWeight=1 PerfScore 2.50 -G_M45070_IG02: ;; offset=0x000C + ;; size=8 bbWeight=1 PerfScore 1.50 +G_M45070_IG02: ;; offset=0x0008 cbz x0, G_M45070_IG06 ;; size=4 bbWeight=1 PerfScore 1.00 -G_M45070_IG03: ;; offset=0x0010 - str x0, [fp, #0x10] // [V07 tmp6] - str wzr, [fp, #0x18] // [V08 tmp7] - ldr x0, [fp, #0x10] // [V07 tmp6] - ldr w0, [x0, #0x08] - str w0, [fp, #0x1C] // [V09 tmp8] - ;; size=20 bbWeight=0.80 PerfScore 6.40 -G_M45070_IG04: ;; offset=0x0024 - ldp x0, x1, [fp, #0x10] // [V03 tmp2], [V03 tmp2+0x08] - ;; size=4 bbWeight=1 PerfScore 3.00 -G_M45070_IG05: ;; offset=0x0028 - ldp fp, lr, [sp], #0x20 +G_M45070_IG03: ;; offset=0x000C + ldr w1, [x0, #0x08] + ;; size=4 bbWeight=0.80 PerfScore 2.40 +G_M45070_IG04: ;; offset=0x0010 + mov w1, w1 + mov x2, xzr + orr x1, x2, x1, LSL #32 + ;; size=12 bbWeight=1 PerfScore 2.00 +G_M45070_IG05: ;; offset=0x001C + ldp fp, lr, [sp], #0x10 ret lr ;; size=8 bbWeight=1 PerfScore 2.00 -G_M45070_IG06: ;; offset=0x0030 - str xzr, [fp, #0x10] // [V07 tmp6] - str xzr, [fp, #0x18] +G_M45070_IG06: ;; offset=0x0024 + mov x0, xzr + mov w1, wzr b G_M45070_IG04 - ;; size=12 bbWeight=0.20 PerfScore 0.60 + ;; size=12 bbWeight=0.20 PerfScore 0.40 ``` (sneak peek -- this codegen requires some supplementary changes, and there's additional opportunities here) This is the return counterpart to #112740. That PR has a bunch of regressions that makes it look like we need to support returns/call arguments first, before we try to support parameters. There's a few follow-ups here: - Support for float->float insertions (when a float value needs to be returned as the 1st, 2nd, .... field of a SIMD register) - Support for coalescing memory loads, particularly because the fields of the `FIELD_LIST` come from a promoted struct that ended up DNER. In those cases we should be able to recombine the fields back to a single large field, instead of combining them with bitwise operations. - Support for constant folding the bitwise insertions. This requires some more constant folding support in lowering. - The JIT has lots of (now outdated) restrictions based around multi-reg returns that get in the way. Lifting these should improve things considerably.

…ed-call-arg-fields

…ameters

jakobbotsch · 2025-08-29T09:47:37Z

Spot checking some regressions with #118778 included:

27865.dasm - Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)

@@ -20,7 +20,9 @@
 ;* V10 tmp7         [V10    ] (  0,  0   )     int  ->  zero-ref    "field V05.value (fldOffset=0x4)" P-INDEP
 ;* V11 tmp8         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "field V07.hasValue (fldOffset=0x0)" P-INDEP
 ;* V12 tmp9         [V12    ] (  0,  0   )     int  ->  zero-ref    "field V07.value (fldOffset=0x4)" P-INDEP
-;  V13 rat0         [V13,T02] (  3,  3   )    long  ->  rdx         "V01.rdx"
+;  V13 tmp10        [V13,T04] (  2,  2   )   ubyte  ->  rax         "V01.[000..001)"
+;  V14 tmp11        [V14,T05] (  2,  2   )     int  ->  rdx         "V01.[004..008)"
+;  V15 rat0         [V15,T02] (  4,  4   )    long  ->  rdx         "V01.rdx"
 ;
 ; Lcl frame size = 0
 
@@ -28,19 +30,24 @@ G_M3648_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M3648_IG02:        ; bbWeight=1, gcrefRegs=0002 {rcx}, byrefRegs=0000 {}, byref
        ; gcrRegs +[rcx]
-       mov      eax, 1
-       mov      r10d, 2
+       movzx    rax, dl
+       shr      rdx, 32
+       mov      r10d, 1
+       mov      r9d, 2
        test     r8b, r8b
-       mov      r8d, r10d
-       cmovne   r8d, eax
+       mov      r8d, r9d
+       cmovne   r8d, r10d
+       mov      edx, edx
+       shl      rdx, 32
+       or       rdx, rax
        mov      rax, qword ptr [rcx]
        mov      rax, qword ptr [rax+0x60]
-						;; size=28 bbWeight=1 PerfScore 5.25
+						;; size=45 bbWeight=1 PerfScore 7.00
 G_M3648_IG03:        ; bbWeight=1, epilog, nogc, extend
        tail.jmp [rax+0x38]<unknown method>
 						;; size=4 bbWeight=1 PerfScore 4.00
 
-; Total bytes of code 32, prolog size 0, PerfScore 9.25, instruction count 8, allocated bytes for code 32 (MethodHash=90bdf1bf) for method Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)
+; Total bytes of code 49, prolog size 0, PerfScore 11.00, instruction count 13, allocated bytes for code 49 (MethodHash=90bdf1bf) for method Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)
 ; ============================================================

We end up promoting a Nullable<int> into two fields even thought it is only used as a parameter/argument. That happens because it gets assigned to a local in the function that is promoted into those two fields by old promotion.

Previously we did not promote it and end up copy propagating it such that the final call in this function takes LCL_FLD<long>(V01). The backend ends up treating the parameter as a single promoted register, which is better.

This should eventually be better: the idea is that once old promotion is removed there is no point in promoting this local as a bool + int; instead ABI induced promotion will result in the same promotion as a single long and then we will get the same codegen as before. It requires some more work that will happen in .NET 11.

…ameters

jakobbotsch · 2025-09-04T23:59:56Z

/azp run runtime-coreclr superpmi-diffs

azure-pipelines · 2025-09-05T00:00:56Z

Azure Pipelines successfully started running 1 pipeline(s).

jakobbotsch · 2025-09-05T12:46:24Z

446496.dasm - System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)

@@ -11,15 +11,16 @@
 ;
 ;* V00 arg0         [V00    ] (  0,  0   )   byref  ->  zero-ref    ld-addr-op single-def
 ;* V01 arg1         [V01    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op single-def <System.Nullable`1[float]>
-;  V02 arg2         [V02,T01] (  3,  2   )  struct ( 8) [rsp+0x18]  do-not-enreg[SF] ld-addr-op single-def <System.Nullable`1[System.Half]>
+;  V02 arg2         [V02,T00] (  4,  2   )  struct ( 8) [rsp+0x18]  do-not-enreg[SF] ld-addr-op single-def <System.Nullable`1[System.Half]>
 ;* V03 loc0         [V03    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op <System.Nullable`1[System.Half]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "field V00.hasValue (fldOffset=0x0)" P-INDEP
 ;* V06 tmp2         [V06    ] (  0,  0   )  double  ->  zero-ref    "field V00.value (fldOffset=0x8)" P-INDEP
 ;* V07 tmp3         [V07    ] (  0,  0   )   ubyte  ->  zero-ref    single-def "field V03.hasValue (fldOffset=0x0)" P-INDEP
 ;* V08 tmp4         [V08    ] (  0,  0   )  ushort  ->  zero-ref    single-def "field V03.value (fldOffset=0x2)" P-INDEP
-;* V09 tmp5         [V09    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref" <System.Nullable`1[double]>
-;  V10 rat0         [V10,T00] (  5,  5   )   ubyte  ->   r8         "V02.r8"
+;  V09 tmp5         [V09,T02] (  3,  2   )   ubyte  ->   r8         single-def "V02.[000..001)"
+;* V10 tmp6         [V10    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref" <System.Nullable`1[double]>
+;  V11 rat0         [V11,T01] (  3,  3   )    long  ->   r8         "V02.r8"
 ;
 ; Lcl frame size = 0
 
@@ -36,13 +37,14 @@ G_M4475_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 G_M4475_IG04:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
+       mov      byte  ptr [rsp+0x18], r8b
        mov      eax, dword ptr [rsp+0x18]
-						;; size=4 bbWeight=0 PerfScore 0.00
+						;; size=9 bbWeight=0 PerfScore 0.00
 G_M4475_IG05:        ; bbWeight=0, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 22, prolog size 0, PerfScore 3.75, instruction count 8, allocated bytes for code 22 (MethodHash=3ebaee84) for method System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)
+; Total bytes of code 27, prolog size 0, PerfScore 3.75, instruction count 9, allocated bytes for code 27 (MethodHash=3ebaee84) for method System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)
 ; ============================================================
 
 Unwind Info:

In this one we now end up promoting the _hasValue field of an incoming parameter nullable, but we do not promote the _value. We then later return it (as a LCL_FLD<nullable>), which means we now have to write the _hasValue field back before that return.

Two things we can do here:

Running physical promotion in RPO we can sometimes (like in this case) keep fields considered up-to-date in their stack home. It would get rid of the stack store.
Once we have promoted _hasValue, we can realize that it might be a good idea to promote _value as well, to avoid a partially promoted local. ABI considerations may play into this decision.

jakobbotsch · 2025-09-05T12:57:45Z

System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)

@@ -12,15 +12,15 @@
 ;  V00 arg0         [V00,T02] (  6,  4.16)     ref  ->  rdi         class-hnd single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[]>
 ;  V01 arg1         [V01,T00] (  5,  5   )     int  ->  rbx         single-def
 ;  V02 arg2         [V02,T01] (  5,  5   )     int  ->  rsi         single-def
-;  V03 arg3         [V03,T04] (  4,  2.08)  struct ( 8) [rsp+0x78]  do-not-enreg[SF] single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
-;  V04 arg4         [V04,T12] (  2,  0.08)     ref  ->  [rsp+0x80]  class-hnd single-def <System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]>
-;  V05 loc0         [V05,T03] (  8,  4.30)     int  ->  rbx        
+;* V03 arg3         [V03    ] (  0,  0   )  struct ( 8) zero-ref    do-not-enreg[SF] single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
+;  V04 arg4         [V04,T14] (  2,  0.08)     ref  ->  [rsp+0x90]  class-hnd single-def <System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]>
+;  V05 loc0         [V05,T04] (  8,  4.30)     int  ->  rbx        
 ;  V06 loc1         [V06,T05] (  5,  2.17)     int  ->  rsi        
-;  V07 loc2         [V07,T07] (  6,  0.32)     int  ->  rbp        
+;  V07 loc2         [V07,T10] (  6,  0.32)     int  ->  r13        
 ;* V08 loc3         [V08    ] (  0,  0   )     int  ->  zero-ref   
 ;  V09 OutArgs      [V09    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
-;* V10 tmp1         [V10,T15] (  0,  0   )     int  ->  zero-ref   
-;  V11 tmp2         [V11,T08] (  4,  0.24)     int  ->  r15         "guarded devirt return temp"
+;* V10 tmp1         [V10,T17] (  0,  0   )     int  ->  zero-ref   
+;  V11 tmp2         [V11,T11] (  4,  0.24)     int  ->  rax         "guarded devirt return temp"
 ;* V12 tmp3         [V12    ] (  0,  0   )  struct ( 8) zero-ref    "guarded devirt arg temp" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
 ;* V13 tmp4         [V13    ] (  0,  0   )     ref  ->  zero-ref    class-hnd exact "guarded devirt this exact temp" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap+PositionComparer>
 ;* V14 tmp5         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
@@ -30,39 +30,50 @@
 ;* V18 tmp9         [V18    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op "Inlining Arg" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
 ;* V19 tmp10        [V19    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
 ;* V20 tmp11        [V20    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op "Inlining Arg" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
-;  V21 tmp12        [V21,T11] (  4,  0.16)     int  ->  r15         "Inline return value spill temp"
+;  V21 tmp12        [V21,T13] (  4,  0.16)     int  ->  rax         "Inline return value spill temp"
 ;* V22 tmp13        [V22    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V23 tmp14        [V23,T09] (  4,  0.18)     int  ->  rcx         "field V12.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
-;  V24 tmp15        [V24,T13] (  2,  0.08)   ubyte  ->   r8         "field V12.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
-;  V25 tmp16        [V25,T14] (  2,  0.08)   ubyte  ->  rdx         "field V12.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
+;  V23 tmp14        [V23,T12] (  4,  0.18)     int  ->   r8         "field V12.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
+;  V24 tmp15        [V24,T15] (  2,  0.08)   ubyte  ->  rcx         "field V12.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
+;  V25 tmp16        [V25,T16] (  2,  0.08)   ubyte  ->  rdx         "field V12.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
 ;* V26 tmp17        [V26    ] (  0,  0   )     int  ->  zero-ref    "field V18.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
 ;* V27 tmp18        [V27    ] (  0,  0   )   ubyte  ->  zero-ref    "field V18.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
 ;* V28 tmp19        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "field V18.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
-;  V29 tmp20        [V29,T10] (  3,  0.18)     int  ->   r8         "field V20.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
+;* V29 tmp20        [V29    ] (  0,  0   )     int  ->  zero-ref    "field V20.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
 ;* V30 tmp21        [V30    ] (  0,  0   )   ubyte  ->  zero-ref    "field V20.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
 ;* V31 tmp22        [V31    ] (  0,  0   )   ubyte  ->  zero-ref    "field V20.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
-;  V32 tmp23        [V32,T06] (  4,  0.63)   byref  ->  rdx         "BlockOp address local"
+;  V32 tmp23        [V32,T06] (  4,  1.10)     int  ->  rbp         "V03.[000..004)"
+;  V33 tmp24        [V33,T07] (  2,  1   )   ubyte  ->  r14         "V03.[004..005)"
+;  V34 tmp25        [V34,T08] (  2,  1   )   ubyte  ->  r15         "V03.[005..006)"
+;  V35 tmp26        [V35,T09] (  4,  0.63)   byref  ->  rdx         "BlockOp address local"
+;  V36 rat0         [V36,T03] (  5,  5   )    long  ->   r9         "V03.r9"
 ;
 ; Lcl frame size = 40
 
 G_M8241_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r15
        push     r14
+       push     r13
+       push     r12
        push     rdi
        push     rsi
        push     rbp
        push     rbx
        sub      rsp, 40
-       mov      qword ptr [rsp+0x78], r9
        mov      rdi, rcx
        ; gcrRegs +[rdi]
        mov      ebx, edx
        mov      esi, r8d
-						;; size=25 bbWeight=1 PerfScore 8.00
+						;; size=24 bbWeight=1 PerfScore 9.00
 G_M8241_IG02:        ; bbWeight=1, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, byref, isz
+       mov      ebp, r9d
+       mov      r14, r9
+       shr      r14, 32
+       movzx    r14, r14b
+       shr      r9, 40
+       movzx    r15, r9b
        test     rdi, rdi
        jne      SHORT G_M8241_IG04
-						;; size=5 bbWeight=1 PerfScore 1.25
+						;; size=27 bbWeight=1 PerfScore 3.25
 G_M8241_IG03:        ; bbWeight=0.80, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, byref
        mov      rcx, 0xD1FFAB1E
        ; gcrRegs +[rcx]
@@ -106,105 +117,113 @@ G_M8241_IG08:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r12
+       pop      r13
        pop      r14
        pop      r15
        ret      
-						;; size=13 bbWeight=1 PerfScore 4.25
+						;; size=17 bbWeight=1 PerfScore 5.25
 G_M8241_IG09:        ; bbWeight=0.08, gcVars=0000000000000000 {}, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, gcvars, byref, isz
        ; gcrRegs +[rdi]
-       mov      ebp, esi
-       sub      ebp, ebx
-       sar      ebp, 1
-       add      ebp, ebx
-       cmp      ebp, dword ptr [rdi+0x08]
+       mov      r13d, esi
+       sub      r13d, ebx
+       sar      r13d, 1
+       add      r13d, ebx
+       cmp      r13d, dword ptr [rdi+0x08]
        jae      G_M8241_IG20
-       mov      edx, ebp
+       mov      edx, r13d
        lea      rdx, bword ptr [rdi+8*rdx+0x10]
        ; byrRegs +[rdx]
-       mov      ecx, dword ptr [rdx]
-       movzx    r8, byte  ptr [rdx+0x04]
+       mov      r8d, dword ptr [rdx]
+       movzx    rcx, byte  ptr [rdx+0x04]
        movzx    rdx, byte  ptr [rdx+0x05]
        ; byrRegs -[rdx]
        mov      r11, 0xD1FFAB1E      ; Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap+PositionComparer
-       mov      r14, gword ptr [rsp+0x80]
-       ; gcrRegs +[r14]
-       cmp      qword ptr [r14], r11
+       mov      r12, gword ptr [rsp+0x90]
+       ; gcrRegs +[r12]
+       cmp      qword ptr [r12], r11
        jne      SHORT G_M8241_IG17
-       mov      r8d, dword ptr [rsp+0x78]
-       cmp      ecx, r8d
+       cmp      r8d, ebp
        jge      SHORT G_M8241_IG14
-						;; size=68 bbWeight=0.08 PerfScore 1.58
-G_M8241_IG10:        ; bbWeight=0.06, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref
-       mov      r15d, -1
-						;; size=6 bbWeight=0.06 PerfScore 0.01
-G_M8241_IG11:        ; bbWeight=0.08, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       test     r15d, r15d
+						;; size=70 bbWeight=0.08 PerfScore 1.50
+G_M8241_IG10:        ; bbWeight=0.06, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref
+       mov      eax, -1
+						;; size=5 bbWeight=0.06 PerfScore 0.01
+G_M8241_IG11:        ; bbWeight=0.08, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       test     eax, eax
        je       SHORT G_M8241_IG18
-       test     r15d, r15d
+       test     eax, eax
        jge      SHORT G_M8241_IG15
-						;; size=10 bbWeight=0.08 PerfScore 0.20
-G_M8241_IG12:        ; bbWeight=0.06, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref
-       lea      ebx, [rbp+0x01]
-						;; size=3 bbWeight=0.06 PerfScore 0.03
-G_M8241_IG13:        ; bbWeight=0.08, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
+						;; size=8 bbWeight=0.08 PerfScore 0.20
+G_M8241_IG12:        ; bbWeight=0.06, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref
+       lea      ebx, [r13+0x01]
+						;; size=4 bbWeight=0.06 PerfScore 0.03
+G_M8241_IG13:        ; bbWeight=0.08, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
        cmp      ebx, esi
-       mov      gword ptr [rsp+0x80], r14
+       mov      gword ptr [rsp+0x90], r12
        jle      SHORT G_M8241_IG09
        jmp      SHORT G_M8241_IG07
 						;; size=14 bbWeight=0.08 PerfScore 0.34
-G_M8241_IG14:        ; bbWeight=0.02, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       cmp      ecx, r8d
+G_M8241_IG14:        ; bbWeight=0.02, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       cmp      r8d, ebp
        jle      SHORT G_M8241_IG16
-       mov      r15d, 1
+       mov      eax, 1
        jmp      SHORT G_M8241_IG11
-						;; size=13 bbWeight=0.02 PerfScore 0.07
-G_M8241_IG15:        ; bbWeight=0.01, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       lea      esi, [rbp-0x01]
+						;; size=12 bbWeight=0.02 PerfScore 0.07
+G_M8241_IG15:        ; bbWeight=0.01, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       lea      esi, [r13-0x01]
        jmp      SHORT G_M8241_IG13
-						;; size=5 bbWeight=0.01 PerfScore 0.04
-G_M8241_IG16:        ; bbWeight=0, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       xor      r15d, r15d
+						;; size=6 bbWeight=0.01 PerfScore 0.04
+G_M8241_IG16:        ; bbWeight=0, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       xor      eax, eax
        jmp      SHORT G_M8241_IG11
-						;; size=5 bbWeight=0 PerfScore 0.00
-G_M8241_IG17:        ; bbWeight=0, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       mov      r8d, r8d
-       shl      r8, 32
+						;; size=4 bbWeight=0 PerfScore 0.00
+G_M8241_IG17:        ; bbWeight=0, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
        mov      ecx, ecx
-       or       r8, rcx
+       shl      rcx, 32
+       mov      r8d, r8d
+       or       rcx, r8
        mov      edx, edx
        shl      rdx, 40
-       or       rdx, r8
-       mov      rcx, r14
+       or       rdx, rcx
+       mov      r8d, r14d
+       shl      r8, 32
+       mov      ecx, ebp
+       or       r8, rcx
+       mov      ecx, r15d
+       shl      rcx, 40
+       or       r8, rcx
+       mov      rcx, r12
        ; gcrRegs +[rcx]
-       mov      r8, qword ptr [rsp+0x78]
        mov      r11, 0xD1FFAB1E      ; code for <unknown method>
        call     [r11]<unknown method>
        ; gcrRegs -[rcx]
        ; gcr arg pop 0
-       mov      r15d, eax
        jmp      SHORT G_M8241_IG11
-						;; size=47 bbWeight=0 PerfScore 0.00
+						;; size=61 bbWeight=0 PerfScore 0.00
 G_M8241_IG18:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
-       ; gcrRegs -[rdi r14]
-       mov      eax, ebp
-						;; size=2 bbWeight=0 PerfScore 0.00
+       ; gcrRegs -[rdi r12]
+       mov      eax, r13d
+						;; size=3 bbWeight=0 PerfScore 0.00
 G_M8241_IG19:        ; bbWeight=0, epilog, nogc, extend
        add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r12
+       pop      r13
        pop      r14
        pop      r15
        ret      
-						;; size=13 bbWeight=0 PerfScore 0.00
+						;; size=17 bbWeight=0 PerfScore 0.00
 G_M8241_IG20:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        call     CORINFO_HELP_RNGCHKFAIL
        ; gcr arg pop 0
        int3     
 						;; size=6 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 314, prolog size 25, PerfScore 29.11, instruction count 99, allocated bytes for code 314 (MethodHash=5ad7dfce) for method System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)
+; Total bytes of code 357, prolog size 24, PerfScore 33.04, instruction count 114, allocated bytes for code 357 (MethodHash=5ad7dfce) for method System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)
 ; ============================================================
 
 Unwind Info:
@@ -212,15 +231,17 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
...

We now promote fields of a number of parameter structs, which leads to a bunch of bitwise extractions. It saves spilling in the prolog, but higher register pressure just leads to more callee saves.

This might improve with more intelligent ABI handling; currently those field promotions happen because we see assignments between old promoted locals and the parameter. If everything was left up to physical promotion, and we then had ABI based promotion, then we may have chosen to promote this in a way that does not require bitwise operations.

Overall I am not super worried about diffs like this, since the point of this PR is to get rid of spills in the general case by introducing bitwise operations to extract fields. I think in general that's the choice we want to make.

Copilot

Pull Request Overview

This PR adds support for bitwise field extractions from parameter registers, eliminating the need to spill parameters to stack when accessing struct fields. The implementation extends the JIT's ability to extract fields directly from parameter registers using bit operations like shifts and masks.

Key changes:

Enhanced parameter register mapping logic to support bitwise field extractions
Modified field access detection to allow non-exact register mappings
Added support for extracting fields at arbitrary offsets within parameter registers

Reviewed Changes

Copilot reviewed 2 out of 2 changed files in this pull request and generated no comments.

File	Description
src/coreclr/jit/promotion.cpp	Updated parameter register mapping detection to support bitwise extractions instead of requiring exact field-to-register alignment
src/coreclr/jit/lower.cpp	Replaced field access logic with comprehensive bitwise extraction implementation, including shift operations and type conversions

jakobbotsch · 2025-09-05T13:01:51Z

cc @dotnet/jit-contrib PTAL @AndyAyersMS

Diffs, diffs with old promotion disabled.

Overall this is a size regression but perfscore improvement. That's expected since we are replacing spills (that are quite compact) with bitwise extraction sequences that are less so.

I analyzed some of the regressions above. Many of them I will aim to improve as part of .NET 11; I think a lot of improvements will come from removing old promotion such that physical promotion can be smarter about promoting register passed parameters and arguments. For example, a struct value only used as a parameter/argument should be promoted as primitives that map to the registers that it is passed in. Today we often promote these as fields that require bitwise operations to extract/assemble because we see assignments to old promoted locals.

jakobbotsch · 2025-09-09T13:42:15Z

/ba-g Issue is known, but build analysis is not unblocking

jakobbotsch · 2025-09-09T13:42:37Z

Hmm, realized I never ran stress on this, so will do that first.

jakobbotsch · 2025-09-09T13:42:52Z

/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress

azure-pipelines · 2025-09-09T13:43:13Z

Azure Pipelines successfully started running 2 pipeline(s).

ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 20, 2025

dotnet-policy-service bot assigned jakobbotsch Feb 20, 2025

Fix a check

a6014fc

jakobbotsch closed this Feb 20, 2025

jakobbotsch reopened this Feb 20, 2025

jakobbotsch added 4 commits February 22, 2025 23:45

Fix arm64 asserts

bae005d

Merge branch 'main' of github.com:dotnet/runtime into bit-extract-par…

c00a323

…ameters

Run jit-format

dfc5f85

Fix build

a93af10

tannergooding reviewed Feb 24, 2025

View reviewed changes

src/coreclr/jit/lower.cpp Outdated Show resolved Hide resolved

tannergooding reviewed Feb 24, 2025

View reviewed changes

src/coreclr/jit/lower.cpp Show resolved Hide resolved

Merge branch 'main' of github.com:dotnet/runtime into bit-extract-par…

0981c61

…ameters

This was referenced Feb 25, 2025

SPMI: Add fallback for notifyInstructionSetUsage #112925

Merged

JIT struct work planned for .NET 10 #109635

Closed

Handle SIMD12s

b6d1da9

jakobbotsch mentioned this pull request Feb 26, 2025

JIT: Avoid unnecessary write backs during physical promotion #112949

Closed

tannergooding approved these changes Feb 27, 2025

View reviewed changes

jakobbotsch mentioned this pull request Mar 5, 2025

JIT: Support bitwise field insertions for return registers #113178

Merged

github-actions bot locked and limited conversation to collaborators Aug 3, 2025

jakobbotsch added 3 commits August 15, 2025 16:01

JIT: Always create FIELD_LIST for struct args in physical promotion

93c0494

Run jit-format

88b688c

Merge branch 'main' of github.com:dotnet/runtime into physical-promot…

059b56c

…ed-call-arg-fields

jakobbotsch mentioned this pull request Aug 29, 2025

JIT: Always create FIELD_LIST for struct args in physical promotion #118778

Merged

jakobbotsch added 2 commits August 29, 2025 11:11

Merge branch 'main' of github.com:dotnet/runtime into bit-extract-par…

9cd1d99

…ameters

Merge branch 'physical-promoted-call-arg-fields' into bit-extract-par…

2862569

…ameters

Merge branch 'main' of github.com:dotnet/runtime into bit-extract-par…

0cc1b53

…ameters

jakobbotsch reopened this Sep 4, 2025

dotnet unlocked this conversation Sep 4, 2025

build-analysis bot mentioned this pull request Sep 4, 2025

Test failure: baseservices/exceptions/stackoverflow/stackoverflowtester/stackoverflowtester.cmd #110173

Open

jakobbotsch marked this pull request as ready for review September 5, 2025 12:57

Copilot AI review requested due to automatic review settings September 5, 2025 12:57

Copilot AI reviewed Sep 5, 2025

View reviewed changes

jakobbotsch requested a review from AndyAyersMS September 5, 2025 13:02

AndyAyersMS approved these changes Sep 5, 2025

View reviewed changes

jakobbotsch merged commit b1cd0f4 into dotnet:main Sep 10, 2025
138 of 140 checks passed

jakobbotsch deleted the bit-extract-parameters branch September 10, 2025 13:25

DrewScoggins mentioned this pull request Sep 16, 2025

[Perf] Linux/x64: 9 Regressions on 9/10/2025 11:38:35 AM +00:00 #119770

Open

github-actions bot locked and limited conversation to collaborators Oct 11, 2025

JIT: Support bitwise field extractions from parameter registers #112740

JIT: Support bitwise field extractions from parameter registers #112740

Uh oh!

Conversation

jakobbotsch commented Feb 20, 2025

Uh oh!

dotnet-policy-service bot commented Feb 20, 2025

Uh oh!

jakobbotsch commented Feb 24, 2025

Uh oh!

azure-pipelines bot commented Feb 24, 2025

Uh oh!

Uh oh!

Uh oh!

jakobbotsch commented Feb 26, 2025

Uh oh!

azure-pipelines bot commented Feb 26, 2025

Uh oh!

jakobbotsch commented Aug 29, 2025

Uh oh!

jakobbotsch commented Sep 4, 2025

Uh oh!

azure-pipelines bot commented Sep 5, 2025

Uh oh!

jakobbotsch commented Sep 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jakobbotsch commented Sep 5, 2025

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull Request Overview

Reviewed Changes

Uh oh!

jakobbotsch commented Sep 5, 2025

Uh oh!

jakobbotsch commented Sep 9, 2025

Uh oh!

jakobbotsch commented Sep 9, 2025

Uh oh!

jakobbotsch commented Sep 9, 2025

Uh oh!

azure-pipelines bot commented Sep 9, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

jakobbotsch commented Sep 5, 2025 •

edited

Loading