Skip to content

JIT: loop interchange optimization #4358

@ghost

Description

@ghost

Source: http://stackoverflow.com/a/11303693/863980

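From the link above it is evident that the following two methods are logically equivalent: once the two loops are interchanged, the test on data[j] no longer depends on the inner loop variable, so loop-invariant code motion can hoist it and the 100000 repeated additions collapse into a single multiplication: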

/// <summary>
/// Baseline form: makes 100000 full passes over the first <paramref name="arraySize"/>
/// elements of <paramref name="data"/>, adding every element that is at least 128 to a
/// local running total.
/// </summary>
/// <param name="data">Array to scan. Passed by ref, but only read in this method.</param>
/// <param name="arraySize">Number of leading elements visited on each pass.</param>
public static void UnHoisted(ref int[] data, int arraySize)
{
    // Running total; it is neither returned nor stored anywhere outside this method.
    long sum = 0;

    // Outer loop: fixed repeat count; i is not used inside the body.
    for (int i = 0; i < 100000; ++i)
    {
        // Inner loop: the comparison below depends only on j, yet it is
        // re-evaluated on every one of the 100000 outer iterations.
        for (int j = 0; j < arraySize; ++j)
        {
            if (data[j] >= 128)
                sum += data[j];
        }
    }
}

and

/// <summary>
/// Hand-transformed form of UnHoisted: makes a single pass over the first
/// <paramref name="arraySize"/> elements of <paramref name="data"/> and, for each element
/// that is at least 128, adds the element multiplied by 100000 to a local running total.
/// </summary>
/// <param name="data">Array to scan. Passed by ref, but only read in this method.</param>
/// <param name="arraySize">Number of leading elements visited.</param>
public static void Hoisted(ref int[] data, int arraySize)
{
    // Running total; it is neither returned nor stored anywhere outside this method.
    long sum = 0;

    for (int j = 0; j < arraySize; ++j)
    {
        if (data[j] >= 128)
        {
            // The multiply replaces the 100000 repeated additions of UnHoisted.
            // NOTE(review): both operands are int, so the product is computed in
            // 32 bits and only then widened to long (the listing shows a 32-bit
            // imul followed by movsxd). It therefore matches UnHoisted only while
            // data[j] * 100000 fits in int, which holds for the values Main generates.
            sum += data[j] * 100000;
        }
    }
}

The driver (Main) method looks like:

/// <summary>
/// Driver: builds a 32768-element array of random values and runs one of the two
/// loop variants over it.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main (string[] args)
{
    const int arraySize = 32768;
    int[] data = new int[arraySize];
    Random random = new Random();

    // Each element is random.Next() % 256, so its magnitude is below 256.
    for (int c = 0; c < arraySize; ++c)
        data[c] = random.Next() % 256;

    // Only one variant is active at a time; the other call is left commented out
    // (presumably swapped in by hand to compare the two).
    UnHoisted(ref data, arraySize);
    // Hoisted(ref data, arraySize);
}

Diffing the produced disasm of UnHoisted and Hoisted methods results in:

diff --git a/c:/temp/UnHoisted.txt b/c:/temp/Hoisted.txt
index 9f25a6b..e4cbf57 100644
--- a/c:/temp/UnHoisted.txt
+++ b/c:/temp/Hoisted.txt
@@ -1,79 +1,66 @@
-; Assembly listing for method Program:UnHoisted(byref,int)
+; Assembly listing for method Program:Hoisted(byref,int)
 ; Emitting BLENDED_CODE for X64 CPU with AVX
 ; optimized code
 ; rsp based frame
 ; fully interruptible
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T04] (  4,  18  )   byref  ->  rcx        
-;  V01 arg1         [V01,T05] (  3,  18  )     int  ->  rdx        
-;  V02 loc0         [V02,T06] (  3,  17  )    long  ->  rax        
-;  V03 loc1         [V03,T07] (  4,  13  )     int  ->   r8        
-;  V04 loc2         [V04,T00] (  8,  66  )     int  ->  r10        
-;  V05 loc3         [V05,T01] (  6,  64  )    bool  ->   r9        
-;  V06 tmp0         [V06,T02] (  3,  48  )     ref  ->   r9        
-;  V07 tmp1         [V07,T03] (  3,  48  )     ref  ->   r9        
-;  V08 OutArgs      [V08    ] (  1,   1  )  lclBlk (32) [rsp+0x00]  
+;  V00 arg0         [V00,T05] (  4,   6  )   byref  ->  rcx        
+;  V01 arg1         [V01,T04] (  3,  10  )     int  ->  rdx        
+;  V02 loc0         [V02,T06] (  3,   5  )    long  ->  rax        
+;  V03 loc1         [V03,T00] (  8,  21  )     int  ->   r8        
+;  V04 loc2         [V04,T01] (  4,  20  )    bool  ->   r9        
+;  V05 tmp0         [V05,T02] (  3,  12  )     ref  ->   r9        
+;  V06 tmp1         [V06,T03] (  3,  12  )     ref  ->   r9        
+;  V07 OutArgs      [V07    ] (  1,   1  )  lclBlk (32) [rsp+0x00]  
 ;
 ; Lcl frame size = 40

-G_M4700_IG01:
+G_M9639_IG01:
        4883EC28             sub      rsp, 40

-G_M4700_IG02:
+G_M9639_IG02:
        33C0                 xor      rax, rax
        4533C0               xor      r8d, r8d
-       EB5D                 jmp      SHORT G_M4700_IG07
+       EB49                 jmp      SHORT G_M9639_IG05

-G_M4700_IG03:
-       4533D2               xor      r10d, r10d
-       EB45                 jmp      SHORT G_M4700_IG06
-
-G_M4700_IG04:
+G_M9639_IG03:
        4C8B09               mov      r9, gword ptr [rcx]
-       458B5908             mov      r11d, dword ptr [r9+8]
-       453BD3               cmp      r10d, r11d
-       7365                 jae      SHORT G_M4700_IG09
-       4D63DA               movsxd   r11d, r10d
-       43817C991080000000   cmp      dword ptr [r9+4*r11+16], 128
+       458B5108             mov      r10d, dword ptr [r9+8]
+       453BC2               cmp      r8d, r10d
+       7352                 jae      SHORT G_M9639_IG07
+       4D63D0               movsxd   r10d, r8d
+       43817C911080000000   cmp      dword ptr [r9+4*r10+16], 128
        410F9CC1             setl     r9b
        450FB6C9             movzx    r9, r9b
        4585C9               test     r9d, r9d
-       751D                 jne      SHORT G_M4700_IG05
+       7521                 jne      SHORT G_M9639_IG04
        4C8B09               mov      r9, gword ptr [rcx]
-       458B5908             mov      r11d, dword ptr [r9+8]
-       453BD3               cmp      r10d, r11d
-       7340                 jae      SHORT G_M4700_IG09
-       4D63DA               movsxd   r11d, r10d
-       478B4C9910           mov      r9d, dword ptr [r9+4*r11+16]
+       458B5108             mov      r10d, dword ptr [r9+8]
+       453BC2               cmp      r8d, r10d
+       732D                 jae      SHORT G_M9639_IG07
+       4D63D0               movsxd   r10d, r8d
+       47694C9110A0860100   imul     r9d, dword ptr [r9+4*r10+16], 0x186A0
        4D63C9               movsxd   r9d, r9d
        4903C1               add      rax, r9

-G_M4700_IG05:
-       41FFC2               inc      r10d
-
-G_M4700_IG06:
-       443BD2               cmp      r10d, edx
-       410F9CC1             setl     r9b
-       450FB6C9             movzx    r9, r9b
-       4585C9               test     r9d, r9d
-       75AE                 jne      SHORT G_M4700_IG04
+G_M9639_IG04:
        41FFC0               inc      r8d

-G_M4700_IG07:
-       4181F8A0860100       cmp      r8d, 0x186A0
+G_M9639_IG05:
+       443BC2               cmp      r8d, edx
        410F9CC1             setl     r9b
        450FB6C9             movzx    r9, r9b
        4585C9               test     r9d, r9d
-       7592                 jne      SHORT G_M4700_IG03
+       75AA                 jne      SHORT G_M9639_IG03

-G_M4700_IG08:
+G_M9639_IG06:
        4883C428             add      rsp, 40
        C3                   ret      

-G_M4700_IG09:
-       E8D152265F           call     CORINFO_HELP_RNGCHKFAIL
+G_M9639_IG07:
+       E8E952245F           call     CORINFO_HELP_RNGCHKFAIL
        CC                   int3     

-; Total bytes of code 132, prolog size 4 for method Program:UnHoisted(byref,int)
+; Total bytes of code 108, prolog size 4 for method Program:Hoisted(byref,int)
 ; ============================================================

The execution time of UnHoisted() is much higher than that of Hoisted().

Do we have a starting point for this kind of optimization, such as some basic heuristics already in place that need to be enhanced, or would it require an implementation from the ground up?

Cc @mikedn, @omariom, @cmckinsey

category:cq
theme:loop-opt
skill-level:expert
cost:extra-large

Metadata

Metadata

Assignees

No one assigned

    Labels

    area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
    enhancement: Product code improvement that does NOT require public API changes/additions
    optimization
    tenet-performance: Performance related issue

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions