
Commit 0fd6a44

Author: Li Tian (committed)
Remove AVX/SSE transition penalties
There are two kinds of transition penalties:

1. Transition from 256-bit AVX code to 128-bit legacy SSE code.
2. Transition from 128-bit legacy SSE code to either 128-bit or 256-bit AVX code. This only happens if there was a preceding AVX256-to-legacy-SSE transition (penalty #1).

The primary goal is to remove penalty #1, the AVX to SSE transition penalty.

Added two emitter flags: contains256bitAVXInstruction indicates that the JIT method contains 256-bit AVX code; containsAVXInstruction indicates that the method contains 128-bit or 256-bit AVX code.

Issue VZEROUPPER in the prolog if the method contains 128-bit or 256-bit AVX code, to avoid the legacy SSE to AVX transition penalty; this can happen in the reverse PInvoke situation.

Issue VZEROUPPER in the epilog if the method contains 256-bit AVX code, to avoid the AVX to legacy SSE transition penalty.

To limit the code-size increase, we only issue VZEROUPPER before a PInvoke call to a user-defined function if the JIT method contains 256-bit AVX code, on the assumption that the user-defined function may contain legacy SSE code. There is no need to issue VZEROUPPER after the PInvoke call, because penalty #2 (SSE to AVX) cannot occur once penalty #1 (AVX to SSE) has been taken care of before the call.

We measured a ~1% to 3% performance gain on TechEmpower plaintext and verified with VTune that the AVX/SSE transition events OTHER_ASSISTS.AVX_TO_SSE and OTHER_ASSISTS.SSE_TO_AVX have been reduced to 0.

Fix #7240
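For illustration, here is a minimal user-level sketch of the penalty being avoided. This snippet is not part of the commit: _mm256_zeroupper() is the compiler-intrinsic form of VZEROUPPER, and legacy_sse_routine is a hypothetical stand-in for any callee compiled with non-VEX (legacy SSE) encodings.

    #include <immintrin.h>

    // Hypothetical stand-in for native code compiled with legacy SSE encodings.
    extern "C" float legacy_sse_routine(const float* data, int n);

    float mixed_avx_sse(const float* a, const float* b, float* out, int n)
    {
        // 256-bit AVX loop: leaves the upper halves of the YMM registers dirty.
        for (int i = 0; i + 8 <= n; i += 8)
        {
            __m256 va = _mm256_loadu_ps(a + i);
            __m256 vb = _mm256_loadu_ps(b + i);
            _mm256_storeu_ps(out + i, _mm256_add_ps(va, vb));
        }

        // Without this, the first legacy SSE instruction in the callee pays the
        // AVX-to-SSE transition penalty (VTune: OTHER_ASSISTS.AVX_TO_SSE). This is
        // the same effect the JIT achieves by emitting VZEROUPPER before the call.
        _mm256_zeroupper();

        return legacy_sse_routine(out, n);
    }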
1 parent c10c1ff commit 0fd6a44

File tree

5 files changed: +139 additions, −20 deletions

src/jit/codegencommon.cpp

Lines changed: 50 additions & 20 deletions
@@ -10583,6 +10583,19 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree
 // funclet frames: this will be FuncletInfo.fiSpDelta.
 void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 {
+#ifdef FEATURE_AVX_SUPPORT
+    bool bVzeroupperIssued = false;
+    // If the method contains AVX instruction, issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
+    // This is to avoid AVX/SSE transition penalty since in Reverse PInvoke, native code that contains Legacy SSE
+    // instruction can call into this managed code that contains 128bit or 256bit AVX instruction
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+        getEmitter()->ContainsAVX())
+    {
+        instGen(INS_vzeroupper);
+        bVzeroupperIssued = true;
+    }
+#endif
+
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

     // Only callee saved floating point registers should be in regMask
@@ -10611,6 +10624,16 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
         regMaskTP regBit = genRegMask(reg);
         if ((regBit & regMask) != 0)
         {
+#ifdef FEATURE_AVX_SUPPORT
+            // Vzeroupper needs to be issued before copyIns (vmovupd etc.) that are AVX instructions
+            if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+                !bVzeroupperIssued &&
+                getEmitter()->IsAVXInstruction(copyIns))
+            {
+                instGen(INS_vzeroupper);
+                bVzeroupperIssued = true;
+            }
+#endif
             // ABI requires us to preserve lower 128-bits of YMM register.
             getEmitter()->emitIns_AR_R(copyIns,
                                        EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
@@ -10621,16 +10644,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
-
-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
 }

 // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
@@ -10651,6 +10664,19 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     // fast path return
     if (regMask == RBM_NONE)
     {
+#ifdef FEATURE_AVX_SUPPORT
+        // Before return, check if the method contains 256bit AVX instruction, issue a Vzeroupper to zero out
+        // upper 128-bits of all YMM regs. This is to avoid AVX-256 to legacy SSE transition penalty.
+        if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+            getEmitter()->Contains256bitAVX())
+        {
+            instGen(INS_vzeroupper);
+        }
+
+        // reset both AVX and 256bit AVX flags for the function before return
+        getEmitter()->SetContainsAVX(false);
+        getEmitter()->SetContains256bitAVX(false);
+#endif
         return;
     }

@@ -10682,16 +10708,6 @@
     assert((offset % 16) == 0);
 #endif // _TARGET_AMD64_

-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
-
     for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
     {
         regMaskTP regBit = genRegMask(reg);
@@ -10706,6 +10722,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
+
+#ifdef FEATURE_AVX_SUPPORT
+    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
+    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
+    // using SSE2.
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX &&
+        getEmitter()->Contains256bitAVX())
+    {
+        instGen(INS_vzeroupper);
+    }
+    // reset both AVX and 256bit AVX flags for the function before return
+    getEmitter()->SetContainsAVX(false);
+    getEmitter()->SetContains256bitAVX(false);
+#endif
 }
 #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87

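The emitter-side bookkeeping behind ContainsAVX/Contains256bitAVX lives in the two remaining changed files (presumably the emitter sources), which are not shown in this excerpt. A plausible sketch of the accessors used above, assuming plain boolean members on the emitter; the accessor and flag names come from the diff and commit message, but the layout here is an assumption:

    // Sketch only: the real declarations are in the emitter sources not shown in this excerpt.
    class emitter
    {
        bool containsAVXInstruction       = false; // any 128-bit or 256-bit VEX instruction emitted
        bool contains256bitAVXInstruction = false; // any 256-bit (YMM) VEX instruction emitted

    public:
        bool ContainsAVX() const              { return containsAVXInstruction; }
        void SetContainsAVX(bool value)       { containsAVXInstruction = value; }
        bool Contains256bitAVX() const        { return contains256bitAVXInstruction; }
        void SetContains256bitAVX(bool value) { contains256bitAVXInstruction = value; }
    };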

src/jit/codegenxarch.cpp

Lines changed: 14 additions & 0 deletions
@@ -5001,6 +5001,20 @@ void CodeGen::genCallInstruction(GenTreePtr node)

 #endif // defined(_TARGET_X86_)

+#ifdef FEATURE_AVX_SUPPORT
+    // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
+    // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
+    // transition penalty, assume the user function may contains legacy SSE instruction
+    if (call->IsPInvoke() && call->gtCallType == CT_USER_FUNC && compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        if (getEmitter()->Contains256bitAVX())
+        {
+            instGen(INS_vzeroupper);
+        }
+    }
+#endif
+
+
     if (target != nullptr)
     {
 #ifdef _TARGET_X86_
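Pulling the three emission sites together, the policy this commit implements reads roughly as follows. This helper is a condensed restatement for clarity, not code from the commit; the JIT emits these checks inline at each site:

    enum class VzeroupperSite { Prolog, Epilog, BeforePInvokeUserCall };

    // Condensed restatement of the commit's policy (sketch, not actual JIT code).
    bool needsVzeroupper(VzeroupperSite site, bool containsAVX, bool contains256bitAVX)
    {
        switch (site)
        {
            case VzeroupperSite::Prolog:
                // Reverse PInvoke: a legacy-SSE caller may enter this AVX method (penalty #2).
                return containsAVX;
            case VzeroupperSite::Epilog:
                // Returning to a possibly legacy-SSE caller with dirty YMM uppers (penalty #1).
                return contains256bitAVX;
            case VzeroupperSite::BeforePInvokeUserCall:
                // The user-defined native callee may contain legacy SSE code (penalty #1).
                return contains256bitAVX;
        }
        // No site emits VZEROUPPER after a PInvoke call: penalty #2 requires a
        // preceding penalty #1 transition, which the pre-call VZEROUPPER removed.
        return false;
    }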

src/jit/compiler.cpp

Lines changed: 3 additions & 0 deletions
@@ -2310,6 +2310,9 @@ void Compiler::compSetProcessor()
     if (opts.compCanUseAVX)
     {
         codeGen->getEmitter()->SetUseAVX(true);
+        // Assume each JITted method does not contain AVX instruction at first
+        codeGen->getEmitter()->SetContainsAVX(false);
+        codeGen->getEmitter()->SetContains256bitAVX(false);
     }
     else
 #endif // FEATURE_AVX_SUPPORT
