Skip to content

Conversation

jakobbotsch
Copy link
Member

Recent work now allows us to finally add support for the backend to extract fields out of parameters without spilling them to stack. Previously this was only supported when the fields mapped cleanly to registers.

A win-x64 example:

static int Foo(int? foo)
{
    return foo.HasValue ? foo.Value : 0;
}
 ; Method Program:Foo(System.Nullable`1[int]):int (FullOpts)
 G_M19236_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=0.50 PerfScore 0.50
+						;; size=0 bbWeight=0.50 PerfScore 0.00

-G_M19236_IG02:  ;; offset=0x0005
-       movzx    rcx, cl
-       xor      eax, eax
-       test     ecx, ecx
-       cmovne   eax, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=0.50 PerfScore 1.38
+G_M19236_IG02:  ;; offset=0x0000
+       movzx    rax, cl
+       shr      rcx, 32
+       xor      edx, edx
+       test     eax, eax
+       mov      eax, edx
+       cmovne   eax, ecx
+						;; size=16 bbWeight=0.50 PerfScore 0.88

-G_M19236_IG03:  ;; offset=0x0011
+G_M19236_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=0.50 PerfScore 0.50
-; Total bytes of code: 18
-
+; Total bytes of code: 17

Another win-x64 example:

static float Sum(PointF p)
{
    return p.X + p.Y;
}
 ; Method Program:Sum(System.Drawing.PointF):float (FullOpts)
 G_M48891_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=1 PerfScore 1.00
+						;; size=0 bbWeight=1 PerfScore 0.00

-G_M48891_IG02:  ;; offset=0x0005
-       vmovss   xmm0, dword ptr [rsp+0x08]
-       vaddss   xmm0, xmm0, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=1 PerfScore 8.00
+G_M48891_IG02:  ;; offset=0x0000
+       vmovd    xmm0, ecx
+       shr      rcx, 32
+       vmovd    xmm1, ecx
+       vaddss   xmm0, xmm0, xmm1
+						;; size=16 bbWeight=1 PerfScore 7.50

-G_M48891_IG03:  ;; offset=0x0011
+G_M48891_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code: 18
+; Total bytes of code: 17

An arm64 example:

static bool Test(Memory<int> mem)
{
    return mem.Length > 10;
}
 ; Method Program:Test(System.Memory`1[int]):ubyte (FullOpts)
 G_M53448_IG01:  ;; offset=0x0000
-            stp     fp, lr, [sp, #-0x20]!
+            stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
-            stp     x0, x1, [fp, #0x10]	// [V00 arg0], [V00 arg0+0x08]
-						;; size=12 bbWeight=1 PerfScore 2.50
+						;; size=8 bbWeight=1 PerfScore 1.50

-G_M53448_IG02:  ;; offset=0x000C
-            ldr     w0, [fp, #0x1C]	// [V00 arg0+0x0c]
+G_M53448_IG02:  ;; offset=0x0008
+            lsr     x0, x1, #32
             cmp     w0, #10
             cset    x0, gt
-						;; size=12 bbWeight=1 PerfScore 3.00
+						;; size=12 bbWeight=1 PerfScore 2.00

-G_M53448_IG03:  ;; offset=0x0018
-            ldp     fp, lr, [sp], #0x20
+G_M53448_IG03:  ;; offset=0x0014
+            ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 32
+; Total bytes of code: 28

Float -> float extractions that do not map cleanly is still not supported, but should be doable (via vector register extractions). Float -> int extractions are not supported, but I'm not sure we see these.

This is often not a code size improvement, but typically a perfscore improvement. Also this seems to have some bad interactions with call arguments since they do not yet support something similar, but hopefully that can be improved separately.

This should fix a number of issues that I need to go find.

Recent work now allows us to finally add support for the backend to
extract fields out of parameters without spilling them to stack.
Previously this was only supported when the fields mapped cleanly to
registers.

A win-x64 example:
```csharp
static int Foo(int? foo)
{
    return foo.HasValue ? foo.Value : 0;
}
```
```diff
 ; Method Program:Foo(System.Nullable`1[int]):int (FullOpts)
 G_M19236_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=0.50 PerfScore 0.50
+						;; size=0 bbWeight=0.50 PerfScore 0.00

-G_M19236_IG02:  ;; offset=0x0005
-       movzx    rcx, cl
-       xor      eax, eax
-       test     ecx, ecx
-       cmovne   eax, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=0.50 PerfScore 1.38
+G_M19236_IG02:  ;; offset=0x0000
+       movzx    rax, cl
+       shr      rcx, 32
+       xor      edx, edx
+       test     eax, eax
+       mov      eax, edx
+       cmovne   eax, ecx
+						;; size=16 bbWeight=0.50 PerfScore 0.88

-G_M19236_IG03:  ;; offset=0x0011
+G_M19236_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=0.50 PerfScore 0.50
-; Total bytes of code: 18
-
+; Total bytes of code: 17
```

Another win-x64 example:
```csharp
static float Sum(PointF p)
{
    return p.X + p.Y;
}
```

```diff
 ; Method Program:Sum(System.Drawing.PointF):float (FullOpts)
 G_M48891_IG01:  ;; offset=0x0000
-       mov      qword ptr [rsp+0x08], rcx
-						;; size=5 bbWeight=1 PerfScore 1.00
+						;; size=0 bbWeight=1 PerfScore 0.00

-G_M48891_IG02:  ;; offset=0x0005
-       vmovss   xmm0, dword ptr [rsp+0x08]
-       vaddss   xmm0, xmm0, dword ptr [rsp+0x0C]
-						;; size=12 bbWeight=1 PerfScore 8.00
+G_M48891_IG02:  ;; offset=0x0000
+       vmovd    xmm0, ecx
+       shr      rcx, 32
+       vmovd    xmm1, ecx
+       vaddss   xmm0, xmm0, xmm1
+						;; size=16 bbWeight=1 PerfScore 7.50

-G_M48891_IG03:  ;; offset=0x0011
+G_M48891_IG03:  ;; offset=0x0010
        ret
 						;; size=1 bbWeight=1 PerfScore 1.00
-; Total bytes of code: 18
+; Total bytes of code: 17
```

An arm64 example:
```csharp
static bool Test(Memory<int> mem)
{
    return mem.Length > 10;
}
```

```diff
 ; Method Program:Test(System.Memory`1[int]):ubyte (FullOpts)
 G_M53448_IG01:  ;; offset=0x0000
-            stp     fp, lr, [sp, #-0x20]!
+            stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
-            stp     x0, x1, [fp, #0x10]	// [V00 arg0], [V00 arg0+0x08]
-						;; size=12 bbWeight=1 PerfScore 2.50
+						;; size=8 bbWeight=1 PerfScore 1.50

-G_M53448_IG02:  ;; offset=0x000C
-            ldr     w0, [fp, #0x1C]	// [V00 arg0+0x0c]
+G_M53448_IG02:  ;; offset=0x0008
+            lsr     x0, x1, dotnet#32
             cmp     w0, dotnet#10
             cset    x0, gt
-						;; size=12 bbWeight=1 PerfScore 3.00
+						;; size=12 bbWeight=1 PerfScore 2.00

-G_M53448_IG03:  ;; offset=0x0018
-            ldp     fp, lr, [sp], #0x20
+G_M53448_IG03:  ;; offset=0x0014
+            ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-; Total bytes of code: 32
+; Total bytes of code: 28
```
@ghost ghost added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 20, 2025
Copy link
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

@jakobbotsch
Copy link
Member Author

/azp run runtime-coreclr superpmi-replay, runtime-coreclr superpmi-diffs

Copy link

Azure Pipelines successfully started running 2 pipeline(s).

@jakobbotsch
Copy link
Member Author

/azp run runtime-coreclr superpmi-diffs

Copy link

Azure Pipelines successfully started running 1 pipeline(s).

jakobbotsch added a commit that referenced this pull request Mar 8, 2025
Based on the new `FIELD_LIST` support for returns this PR adds support for the
JIT to combine smaller fields via bitwise operations when returned, instead of
spilling these to stack.

win-x64 examples:
```csharp
static int? Test()
{
    return Environment.TickCount;
}
```

```diff
        call     System.Environment:get_TickCount():int
-       mov      dword ptr [rsp+0x24], eax
-       mov      byte  ptr [rsp+0x20], 1
-       mov      rax, qword ptr [rsp+0x20]
-						;; size=19 bbWeight=1 PerfScore 4.00
+       mov      eax, eax
+       shl      rax, 32
+       or       rax, 1
+						;; size=15 bbWeight=1 PerfScore 2.00
```
(the `mov eax, eax` is unnecessary, but not that simple to get rid of)

 ```csharp
static (int x, float y) Test(int x, float y)
{
    return (x, y);
}
```

```diff
-       mov      dword ptr [rsp], ecx
-       vmovss   dword ptr [rsp+0x04], xmm1
-       mov      rax, qword ptr [rsp]
+       vmovd    eax, xmm1
+       shl      rax, 32
+       mov      ecx, ecx
+       or       rax, rcx
 						;; size=13 bbWeight=1 PerfScore 3.00
```

An arm64 example:
```csharp
static Memory<int> ToMemory(int[] arr)
{
    return arr.AsMemory();
}
```

```diff
 G_M45070_IG01:  ;; offset=0x0000
-            stp     fp, lr, [sp, #-0x20]!
+            stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
-            str     xzr, [fp, #0x10]	// [V03 tmp2]
-						;; size=12 bbWeight=1 PerfScore 2.50
-G_M45070_IG02:  ;; offset=0x000C
+						;; size=8 bbWeight=1 PerfScore 1.50
+G_M45070_IG02:  ;; offset=0x0008
             cbz     x0, G_M45070_IG06
 						;; size=4 bbWeight=1 PerfScore 1.00
-G_M45070_IG03:  ;; offset=0x0010
-            str     x0, [fp, #0x10]	// [V07 tmp6]
-            str     wzr, [fp, #0x18]	// [V08 tmp7]
-            ldr     x0, [fp, #0x10]	// [V07 tmp6]
-            ldr     w0, [x0, #0x08]
-            str     w0, [fp, #0x1C]	// [V09 tmp8]
-						;; size=20 bbWeight=0.80 PerfScore 6.40
-G_M45070_IG04:  ;; offset=0x0024
-            ldp     x0, x1, [fp, #0x10]	// [V03 tmp2], [V03 tmp2+0x08]
-						;; size=4 bbWeight=1 PerfScore 3.00
-G_M45070_IG05:  ;; offset=0x0028
-            ldp     fp, lr, [sp], #0x20
+G_M45070_IG03:  ;; offset=0x000C
+            ldr     w1, [x0, #0x08]
+						;; size=4 bbWeight=0.80 PerfScore 2.40
+G_M45070_IG04:  ;; offset=0x0010
+            mov     w1, w1
+            mov     x2, xzr
+            orr     x1, x2, x1,  LSL #32
+						;; size=12 bbWeight=1 PerfScore 2.00
+G_M45070_IG05:  ;; offset=0x001C
+            ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-G_M45070_IG06:  ;; offset=0x0030
-            str     xzr, [fp, #0x10]	// [V07 tmp6]
-            str     xzr, [fp, #0x18]
+G_M45070_IG06:  ;; offset=0x0024
+            mov     x0, xzr
+            mov     w1, wzr
             b       G_M45070_IG04
-						;; size=12 bbWeight=0.20 PerfScore 0.60
+						;; size=12 bbWeight=0.20 PerfScore 0.40
```
(sneak peek -- this codegen requires some supplementary changes, and there's
additional opportunities here)

This is the return counterpart to #112740. That PR has a bunch of regressions
that makes it look like we need to support returns/call arguments first, before
we try to support parameters.

There's a few follow-ups here:
- Support for float->float insertions (when a float value needs to be returned
  as the 1st, 2nd, .... field of a SIMD register)
- Support for coalescing memory loads, particularly because the fields of the
  `FIELD_LIST` come from a promoted struct that ended up DNER. In those cases we
  should be able to recombine the fields back to a single large field, instead
  of combining them with bitwise operations.
- Support for constant folding the bitwise insertions. This requires some more
  constant folding support in lowering.
- The JIT has lots of (now outdated) restrictions based around multi-reg returns
  that get in the way. Lifting these should improve things considerably.
@github-actions github-actions bot locked and limited conversation to collaborators Aug 3, 2025
@jakobbotsch
Copy link
Member Author

Spot checking some regressions with #118778 included:

27865.dasm - Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)

@@ -20,7 +20,9 @@
 ;* V10 tmp7         [V10    ] (  0,  0   )     int  ->  zero-ref    "field V05.value (fldOffset=0x4)" P-INDEP
 ;* V11 tmp8         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "field V07.hasValue (fldOffset=0x0)" P-INDEP
 ;* V12 tmp9         [V12    ] (  0,  0   )     int  ->  zero-ref    "field V07.value (fldOffset=0x4)" P-INDEP
-;  V13 rat0         [V13,T02] (  3,  3   )    long  ->  rdx         "V01.rdx"
+;  V13 tmp10        [V13,T04] (  2,  2   )   ubyte  ->  rax         "V01.[000..001)"
+;  V14 tmp11        [V14,T05] (  2,  2   )     int  ->  rdx         "V01.[004..008)"
+;  V15 rat0         [V15,T02] (  4,  4   )    long  ->  rdx         "V01.rdx"
 ;
 ; Lcl frame size = 0
 
@@ -28,19 +30,24 @@ G_M3648_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M3648_IG02:        ; bbWeight=1, gcrefRegs=0002 {rcx}, byrefRegs=0000 {}, byref
        ; gcrRegs +[rcx]
-       mov      eax, 1
-       mov      r10d, 2
+       movzx    rax, dl
+       shr      rdx, 32
+       mov      r10d, 1
+       mov      r9d, 2
        test     r8b, r8b
-       mov      r8d, r10d
-       cmovne   r8d, eax
+       mov      r8d, r9d
+       cmovne   r8d, r10d
+       mov      edx, edx
+       shl      rdx, 32
+       or       rdx, rax
        mov      rax, qword ptr [rcx]
        mov      rax, qword ptr [rax+0x60]
-						;; size=28 bbWeight=1 PerfScore 5.25
+						;; size=45 bbWeight=1 PerfScore 7.00
 G_M3648_IG03:        ; bbWeight=1, epilog, nogc, extend
        tail.jmp [rax+0x38]<unknown method>
 						;; size=4 bbWeight=1 PerfScore 4.00
 
-; Total bytes of code 32, prolog size 0, PerfScore 9.25, instruction count 8, allocated bytes for code 32 (MethodHash=90bdf1bf) for method Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)
+; Total bytes of code 49, prolog size 0, PerfScore 11.00, instruction count 13, allocated bytes for code 49 (MethodHash=90bdf1bf) for method Microsoft.EntityFrameworkCore.Metadata.Internal.InternalForeignKeyBuilder:Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder.OnDelete(System.Nullable`1[int],bool):Microsoft.EntityFrameworkCore.Metadata.Builders.IConventionForeignKeyBuilder:this (FullOpts)
 ; ============================================================

We end up promoting a Nullable<int> into two fields even thought it is only used as a parameter/argument. That happens because it gets assigned to a local in the function that is promoted into those two fields by old promotion.

Previously we did not promote it and end up copy propagating it such that the final call in this function takes LCL_FLD<long>(V01). The backend ends up treating the parameter as a single promoted register, which is better.

This should eventually be better: the idea is that once old promotion is removed there is no point in promoting this local as a bool + int; instead ABI induced promotion will result in the same promotion as a single long and then we will get the same codegen as before. It requires some more work that will happen in .NET 11.

@jakobbotsch
Copy link
Member Author

/azp run runtime-coreclr superpmi-diffs

Copy link

Azure Pipelines successfully started running 1 pipeline(s).

@jakobbotsch
Copy link
Member Author

jakobbotsch commented Sep 5, 2025

446496.dasm - System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)

@@ -11,15 +11,16 @@
 ;
 ;* V00 arg0         [V00    ] (  0,  0   )   byref  ->  zero-ref    ld-addr-op single-def
 ;* V01 arg1         [V01    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op single-def <System.Nullable`1[float]>
-;  V02 arg2         [V02,T01] (  3,  2   )  struct ( 8) [rsp+0x18]  do-not-enreg[SF] ld-addr-op single-def <System.Nullable`1[System.Half]>
+;  V02 arg2         [V02,T00] (  4,  2   )  struct ( 8) [rsp+0x18]  do-not-enreg[SF] ld-addr-op single-def <System.Nullable`1[System.Half]>
 ;* V03 loc0         [V03    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op <System.Nullable`1[System.Half]>
 ;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V05 tmp1         [V05    ] (  0,  0   )   ubyte  ->  zero-ref    "field V00.hasValue (fldOffset=0x0)" P-INDEP
 ;* V06 tmp2         [V06    ] (  0,  0   )  double  ->  zero-ref    "field V00.value (fldOffset=0x8)" P-INDEP
 ;* V07 tmp3         [V07    ] (  0,  0   )   ubyte  ->  zero-ref    single-def "field V03.hasValue (fldOffset=0x0)" P-INDEP
 ;* V08 tmp4         [V08    ] (  0,  0   )  ushort  ->  zero-ref    single-def "field V03.value (fldOffset=0x2)" P-INDEP
-;* V09 tmp5         [V09    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref" <System.Nullable`1[double]>
-;  V10 rat0         [V10,T00] (  5,  5   )   ubyte  ->   r8         "V02.r8"
+;  V09 tmp5         [V09,T02] (  3,  2   )   ubyte  ->   r8         single-def "V02.[000..001)"
+;* V10 tmp6         [V10    ] (  0,  0   )  struct (16) zero-ref    "Promoted implicit byref" <System.Nullable`1[double]>
+;  V11 rat0         [V11,T01] (  3,  3   )    long  ->   r8         "V02.r8"
 ;
 ; Lcl frame size = 0
 
@@ -36,13 +37,14 @@ G_M4475_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 G_M4475_IG04:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
+       mov      byte  ptr [rsp+0x18], r8b
        mov      eax, dword ptr [rsp+0x18]
-						;; size=4 bbWeight=0 PerfScore 0.00
+						;; size=9 bbWeight=0 PerfScore 0.00
 G_M4475_IG05:        ; bbWeight=0, epilog, nogc, extend
        ret      
 						;; size=1 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 22, prolog size 0, PerfScore 3.75, instruction count 8, allocated bytes for code 22 (MethodHash=3ebaee84) for method System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)
+; Total bytes of code 27, prolog size 0, PerfScore 3.75, instruction count 9, allocated bytes for code 27 (MethodHash=3ebaee84) for method System.Numerics.Tensors.Tests.Helpers:DetermineTolerance[System.Half](System.Nullable`1[double],System.Nullable`1[float],System.Nullable`1[System.Half]):System.Nullable`1[System.Half] (Tier1)
 ; ============================================================
 
 Unwind Info:

In this one we now end up promoting the _hasValue field of an incoming parameter nullable, but we do not promote the _value. We then later return it (as a LCL_FLD<nullable>), which means we now have to write the _hasValue field back before that return.

Two things we can do here:

  1. Running physical promotion in RPO we can sometimes (like in this case) keep fields considered up-to-date in their stack home. It would get rid of the stack store.
  2. Once we have promoted _hasValue, we can realize that it might be a good idea to promote _value as well, to avoid a partially promoted local. ABI considerations may play into this decision.

@jakobbotsch
Copy link
Member Author

System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)

@@ -12,15 +12,15 @@
 ;  V00 arg0         [V00,T02] (  6,  4.16)     ref  ->  rdi         class-hnd single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[]>
 ;  V01 arg1         [V01,T00] (  5,  5   )     int  ->  rbx         single-def
 ;  V02 arg2         [V02,T01] (  5,  5   )     int  ->  rsi         single-def
-;  V03 arg3         [V03,T04] (  4,  2.08)  struct ( 8) [rsp+0x78]  do-not-enreg[SF] single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
-;  V04 arg4         [V04,T12] (  2,  0.08)     ref  ->  [rsp+0x80]  class-hnd single-def <System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]>
-;  V05 loc0         [V05,T03] (  8,  4.30)     int  ->  rbx        
+;* V03 arg3         [V03    ] (  0,  0   )  struct ( 8) zero-ref    do-not-enreg[SF] single-def <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
+;  V04 arg4         [V04,T14] (  2,  0.08)     ref  ->  [rsp+0x90]  class-hnd single-def <System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]>
+;  V05 loc0         [V05,T04] (  8,  4.30)     int  ->  rbx        
 ;  V06 loc1         [V06,T05] (  5,  2.17)     int  ->  rsi        
-;  V07 loc2         [V07,T07] (  6,  0.32)     int  ->  rbp        
+;  V07 loc2         [V07,T10] (  6,  0.32)     int  ->  r13        
 ;* V08 loc3         [V08    ] (  0,  0   )     int  ->  zero-ref   
 ;  V09 OutArgs      [V09    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
-;* V10 tmp1         [V10,T15] (  0,  0   )     int  ->  zero-ref   
-;  V11 tmp2         [V11,T08] (  4,  0.24)     int  ->  r15         "guarded devirt return temp"
+;* V10 tmp1         [V10,T17] (  0,  0   )     int  ->  zero-ref   
+;  V11 tmp2         [V11,T11] (  4,  0.24)     int  ->  rax         "guarded devirt return temp"
 ;* V12 tmp3         [V12    ] (  0,  0   )  struct ( 8) zero-ref    "guarded devirt arg temp" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
 ;* V13 tmp4         [V13    ] (  0,  0   )     ref  ->  zero-ref    class-hnd exact "guarded devirt this exact temp" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap+PositionComparer>
 ;* V14 tmp5         [V14    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
@@ -30,39 +30,50 @@
 ;* V18 tmp9         [V18    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op "Inlining Arg" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
 ;* V19 tmp10        [V19    ] (  0,  0   )     int  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
 ;* V20 tmp11        [V20    ] (  0,  0   )  struct ( 8) zero-ref    ld-addr-op "Inlining Arg" <Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState>
-;  V21 tmp12        [V21,T11] (  4,  0.16)     int  ->  r15         "Inline return value spill temp"
+;  V21 tmp12        [V21,T13] (  4,  0.16)     int  ->  rax         "Inline return value spill temp"
 ;* V22 tmp13        [V22    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V23 tmp14        [V23,T09] (  4,  0.18)     int  ->  rcx         "field V12.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
-;  V24 tmp15        [V24,T13] (  2,  0.08)   ubyte  ->   r8         "field V12.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
-;  V25 tmp16        [V25,T14] (  2,  0.08)   ubyte  ->  rdx         "field V12.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
+;  V23 tmp14        [V23,T12] (  4,  0.18)     int  ->   r8         "field V12.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
+;  V24 tmp15        [V24,T15] (  2,  0.08)   ubyte  ->  rcx         "field V12.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
+;  V25 tmp16        [V25,T16] (  2,  0.08)   ubyte  ->  rdx         "field V12.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
 ;* V26 tmp17        [V26    ] (  0,  0   )     int  ->  zero-ref    "field V18.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
 ;* V27 tmp18        [V27    ] (  0,  0   )   ubyte  ->  zero-ref    "field V18.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
 ;* V28 tmp19        [V28    ] (  0,  0   )   ubyte  ->  zero-ref    "field V18.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
-;  V29 tmp20        [V29,T10] (  3,  0.18)     int  ->   r8         "field V20.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
+;* V29 tmp20        [V29    ] (  0,  0   )     int  ->  zero-ref    "field V20.<Position>k__BackingField (fldOffset=0x0)" P-INDEP
 ;* V30 tmp21        [V30    ] (  0,  0   )   ubyte  ->  zero-ref    "field V20.<WarningsState>k__BackingField (fldOffset=0x4)" P-INDEP
 ;* V31 tmp22        [V31    ] (  0,  0   )   ubyte  ->  zero-ref    "field V20.<AnnotationsState>k__BackingField (fldOffset=0x5)" P-INDEP
-;  V32 tmp23        [V32,T06] (  4,  0.63)   byref  ->  rdx         "BlockOp address local"
+;  V32 tmp23        [V32,T06] (  4,  1.10)     int  ->  rbp         "V03.[000..004)"
+;  V33 tmp24        [V33,T07] (  2,  1   )   ubyte  ->  r14         "V03.[004..005)"
+;  V34 tmp25        [V34,T08] (  2,  1   )   ubyte  ->  r15         "V03.[005..006)"
+;  V35 tmp26        [V35,T09] (  4,  0.63)   byref  ->  rdx         "BlockOp address local"
+;  V36 rat0         [V36,T03] (  5,  5   )    long  ->   r9         "V03.r9"
 ;
 ; Lcl frame size = 40
 
 G_M8241_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r15
        push     r14
+       push     r13
+       push     r12
        push     rdi
        push     rsi
        push     rbp
        push     rbx
        sub      rsp, 40
-       mov      qword ptr [rsp+0x78], r9
        mov      rdi, rcx
        ; gcrRegs +[rdi]
        mov      ebx, edx
        mov      esi, r8d
-						;; size=25 bbWeight=1 PerfScore 8.00
+						;; size=24 bbWeight=1 PerfScore 9.00
 G_M8241_IG02:        ; bbWeight=1, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, byref, isz
+       mov      ebp, r9d
+       mov      r14, r9
+       shr      r14, 32
+       movzx    r14, r14b
+       shr      r9, 40
+       movzx    r15, r9b
        test     rdi, rdi
        jne      SHORT G_M8241_IG04
-						;; size=5 bbWeight=1 PerfScore 1.25
+						;; size=27 bbWeight=1 PerfScore 3.25
 G_M8241_IG03:        ; bbWeight=0.80, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, byref
        mov      rcx, 0xD1FFAB1E
        ; gcrRegs +[rcx]
@@ -106,105 +117,113 @@ G_M8241_IG08:        ; bbWeight=1, epilog, nogc, extend
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r12
+       pop      r13
        pop      r14
        pop      r15
        ret      
-						;; size=13 bbWeight=1 PerfScore 4.25
+						;; size=17 bbWeight=1 PerfScore 5.25
 G_M8241_IG09:        ; bbWeight=0.08, gcVars=0000000000000000 {}, gcrefRegs=0080 {rdi}, byrefRegs=0000 {}, gcvars, byref, isz
        ; gcrRegs +[rdi]
-       mov      ebp, esi
-       sub      ebp, ebx
-       sar      ebp, 1
-       add      ebp, ebx
-       cmp      ebp, dword ptr [rdi+0x08]
+       mov      r13d, esi
+       sub      r13d, ebx
+       sar      r13d, 1
+       add      r13d, ebx
+       cmp      r13d, dword ptr [rdi+0x08]
        jae      G_M8241_IG20
-       mov      edx, ebp
+       mov      edx, r13d
        lea      rdx, bword ptr [rdi+8*rdx+0x10]
        ; byrRegs +[rdx]
-       mov      ecx, dword ptr [rdx]
-       movzx    r8, byte  ptr [rdx+0x04]
+       mov      r8d, dword ptr [rdx]
+       movzx    rcx, byte  ptr [rdx+0x04]
        movzx    rdx, byte  ptr [rdx+0x05]
        ; byrRegs -[rdx]
        mov      r11, 0xD1FFAB1E      ; Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap+PositionComparer
-       mov      r14, gword ptr [rsp+0x80]
-       ; gcrRegs +[r14]
-       cmp      qword ptr [r14], r11
+       mov      r12, gword ptr [rsp+0x90]
+       ; gcrRegs +[r12]
+       cmp      qword ptr [r12], r11
        jne      SHORT G_M8241_IG17
-       mov      r8d, dword ptr [rsp+0x78]
-       cmp      ecx, r8d
+       cmp      r8d, ebp
        jge      SHORT G_M8241_IG14
-						;; size=68 bbWeight=0.08 PerfScore 1.58
-G_M8241_IG10:        ; bbWeight=0.06, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref
-       mov      r15d, -1
-						;; size=6 bbWeight=0.06 PerfScore 0.01
-G_M8241_IG11:        ; bbWeight=0.08, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       test     r15d, r15d
+						;; size=70 bbWeight=0.08 PerfScore 1.50
+G_M8241_IG10:        ; bbWeight=0.06, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref
+       mov      eax, -1
+						;; size=5 bbWeight=0.06 PerfScore 0.01
+G_M8241_IG11:        ; bbWeight=0.08, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       test     eax, eax
        je       SHORT G_M8241_IG18
-       test     r15d, r15d
+       test     eax, eax
        jge      SHORT G_M8241_IG15
-						;; size=10 bbWeight=0.08 PerfScore 0.20
-G_M8241_IG12:        ; bbWeight=0.06, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref
-       lea      ebx, [rbp+0x01]
-						;; size=3 bbWeight=0.06 PerfScore 0.03
-G_M8241_IG13:        ; bbWeight=0.08, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
+						;; size=8 bbWeight=0.08 PerfScore 0.20
+G_M8241_IG12:        ; bbWeight=0.06, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref
+       lea      ebx, [r13+0x01]
+						;; size=4 bbWeight=0.06 PerfScore 0.03
+G_M8241_IG13:        ; bbWeight=0.08, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
        cmp      ebx, esi
-       mov      gword ptr [rsp+0x80], r14
+       mov      gword ptr [rsp+0x90], r12
        jle      SHORT G_M8241_IG09
        jmp      SHORT G_M8241_IG07
 						;; size=14 bbWeight=0.08 PerfScore 0.34
-G_M8241_IG14:        ; bbWeight=0.02, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       cmp      ecx, r8d
+G_M8241_IG14:        ; bbWeight=0.02, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       cmp      r8d, ebp
        jle      SHORT G_M8241_IG16
-       mov      r15d, 1
+       mov      eax, 1
        jmp      SHORT G_M8241_IG11
-						;; size=13 bbWeight=0.02 PerfScore 0.07
-G_M8241_IG15:        ; bbWeight=0.01, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       lea      esi, [rbp-0x01]
+						;; size=12 bbWeight=0.02 PerfScore 0.07
+G_M8241_IG15:        ; bbWeight=0.01, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       lea      esi, [r13-0x01]
        jmp      SHORT G_M8241_IG13
-						;; size=5 bbWeight=0.01 PerfScore 0.04
-G_M8241_IG16:        ; bbWeight=0, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       xor      r15d, r15d
+						;; size=6 bbWeight=0.01 PerfScore 0.04
+G_M8241_IG16:        ; bbWeight=0, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
+       xor      eax, eax
        jmp      SHORT G_M8241_IG11
-						;; size=5 bbWeight=0 PerfScore 0.00
-G_M8241_IG17:        ; bbWeight=0, gcrefRegs=4080 {rdi r14}, byrefRegs=0000 {}, byref, isz
-       mov      r8d, r8d
-       shl      r8, 32
+						;; size=4 bbWeight=0 PerfScore 0.00
+G_M8241_IG17:        ; bbWeight=0, gcrefRegs=1080 {rdi r12}, byrefRegs=0000 {}, byref, isz
        mov      ecx, ecx
-       or       r8, rcx
+       shl      rcx, 32
+       mov      r8d, r8d
+       or       rcx, r8
        mov      edx, edx
        shl      rdx, 40
-       or       rdx, r8
-       mov      rcx, r14
+       or       rdx, rcx
+       mov      r8d, r14d
+       shl      r8, 32
+       mov      ecx, ebp
+       or       r8, rcx
+       mov      ecx, r15d
+       shl      rcx, 40
+       or       r8, rcx
+       mov      rcx, r12
        ; gcrRegs +[rcx]
-       mov      r8, qword ptr [rsp+0x78]
        mov      r11, 0xD1FFAB1E      ; code for <unknown method>
        call     [r11]<unknown method>
        ; gcrRegs -[rcx]
        ; gcr arg pop 0
-       mov      r15d, eax
        jmp      SHORT G_M8241_IG11
-						;; size=47 bbWeight=0 PerfScore 0.00
+						;; size=61 bbWeight=0 PerfScore 0.00
 G_M8241_IG18:        ; bbWeight=0, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
-       ; gcrRegs -[rdi r14]
-       mov      eax, ebp
-						;; size=2 bbWeight=0 PerfScore 0.00
+       ; gcrRegs -[rdi r12]
+       mov      eax, r13d
+						;; size=3 bbWeight=0 PerfScore 0.00
 G_M8241_IG19:        ; bbWeight=0, epilog, nogc, extend
        add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
+       pop      r12
+       pop      r13
        pop      r14
        pop      r15
        ret      
-						;; size=13 bbWeight=0 PerfScore 0.00
+						;; size=17 bbWeight=0 PerfScore 0.00
 G_M8241_IG20:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        call     CORINFO_HELP_RNGCHKFAIL
        ; gcr arg pop 0
        int3     
 						;; size=6 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 314, prolog size 25, PerfScore 29.11, instruction count 99, allocated bytes for code 314 (MethodHash=5ad7dfce) for method System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)
+; Total bytes of code 357, prolog size 24, PerfScore 33.04, instruction count 114, allocated bytes for code 357 (MethodHash=5ad7dfce) for method System.Collections.Generic.ArraySortHelper`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]:InternalBinarySearch(Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState[],int,int,Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState,System.Collections.Generic.IComparer`1[Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState]):int (Tier1)
 ; ============================================================
 
 Unwind Info:
@@ -212,15 +231,17 @@ Unwind Info:
   >>   End offset   : 0xd1ffab1e (not in unwind data)
   Version           : 1
...

We now promote fields of a number of parameter structs, which leads to a bunch of bitwise extractions. It saves spilling in the prolog, but higher register pressure just leads to more callee saves.

This might improve with more intelligent ABI handling; currently those field promotions happen because we see assignments between old promoted locals and the parameter. If everything was left up to physical promotion, and we then had ABI based promotion, then we may have chosen to promote this in a way that does not require bitwise operations.

Overall I am not super worried about diffs like this, since the point of this PR is to get rid of spills in the general case by introducing bitwise operations to extract fields. I think in general that's the choice we want to make.

@jakobbotsch jakobbotsch marked this pull request as ready for review September 5, 2025 12:57
@Copilot Copilot AI review requested due to automatic review settings September 5, 2025 12:57
Copy link
Contributor

@Copilot Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull Request Overview

This PR adds support for bitwise field extractions from parameter registers, eliminating the need to spill parameters to stack when accessing struct fields. The implementation extends the JIT's ability to extract fields directly from parameter registers using bit operations like shifts and masks.

Key changes:

  • Enhanced parameter register mapping logic to support bitwise field extractions
  • Modified field access detection to allow non-exact register mappings
  • Added support for extracting fields at arbitrary offsets within parameter registers

Reviewed Changes

Copilot reviewed 2 out of 2 changed files in this pull request and generated no comments.

File Description
src/coreclr/jit/promotion.cpp Updated parameter register mapping detection to support bitwise extractions instead of requiring exact field-to-register alignment
src/coreclr/jit/lower.cpp Replaced field access logic with comprehensive bitwise extraction implementation, including shift operations and type conversions

@jakobbotsch
Copy link
Member Author

cc @dotnet/jit-contrib PTAL @AndyAyersMS

Diffs, diffs with old promotion disabled.

Overall this is a size regression but perfscore improvement. That's expected since we are replacing spills (that are quite compact) with bitwise extraction sequences that are less so.

I analyzed some of the regressions above. Many of them I will aim to improve as part of .NET 11; I think a lot of improvements will come from removing old promotion such that physical promotion can be smarter about promoting register passed parameters and arguments. For example, a struct value only used as a parameter/argument should be promoted as primitives that map to the registers that it is passed in. Today we often promote these as fields that require bitwise operations to extract/assemble because we see assignments to old promoted locals.

@jakobbotsch
Copy link
Member Author

/ba-g Issue is known, but build analysis is not unblocking

@jakobbotsch
Copy link
Member Author

Hmm, realized I never ran stress on this, so will do that first.

@jakobbotsch
Copy link
Member Author

/azp run runtime-coreclr jitstress, runtime-coreclr libraries-jitstress

Copy link

Azure Pipelines successfully started running 2 pipeline(s).

@jakobbotsch jakobbotsch merged commit b1cd0f4 into dotnet:main Sep 10, 2025
138 of 140 checks passed
@jakobbotsch jakobbotsch deleted the bit-extract-parameters branch September 10, 2025 13:25
@github-actions github-actions bot locked and limited conversation to collaborators Oct 11, 2025
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants