From b92012c777e602d557124583651313a0afa33484 Mon Sep 17 00:00:00 2001
From: Tobias Hieta <tobias@hieta.se>
Date: Tue, 19 Sep 2023 09:44:33 +0200
Subject: [PATCH 01/46] Remove RC suffix

---
 llvm/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 44f2850b92d52..389bd8b6422da 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -25,7 +25,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX rc)
+  set(LLVM_VERSION_SUFFIX)
 endif()
 
 if (NOT PACKAGE_VERSION)

From dba2a75e9c7ef81fe84774ba5eee5e67e01d801a Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Thu, 7 Mar 2024 21:27:31 -0800
Subject: [PATCH 02/46] Bump version to 18.1.1

---
 llvm/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 389bd8b6422da..ddf95cbc6c517 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -22,7 +22,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
+  set(LLVM_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX)

From 2ad8fbdbca06843db18f450c8a141d6e022d20b5 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 11 Mar 2024 07:31:28 -0700
Subject: [PATCH 03/46] Bump version to 18.1.2 (#84655)

---
 llvm/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index ddf95cbc6c517..c5fa66390bba8 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -22,7 +22,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 1)
+  set(LLVM_VERSION_PATCH 2)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX)

From 439e6f81e772956200aa797eab819b72bb64f84b Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Tue, 27 Feb 2024 18:10:53 +0100
Subject: [PATCH 04/46] [libc++][modules] Fixes naming inconsistency. (#83036)

The modules used is-standard-library and is-std-library. The latter is
the name used in the SG15 proposal.

Fixes: https://github.com/llvm/llvm-project/issues/82879
(cherry picked from commit b50bcc7ffb6ad6caa4c141a22915ab59f725b7ae)
---
 libcxx/modules/modules.json.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in
index ddc377f28f919..759ac92d81f18 100644
--- a/libcxx/modules/modules.json.in
+++ b/libcxx/modules/modules.json.in
@@ -5,7 +5,7 @@
     {
       "logical-name": "std",
       "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm",
-      "is-standard-library": true,
+      "is-std-library": true,
       "local-arguments": {
         "system-include-directories": [
           "@LIBCXX_MODULE_RELATIVE_PATH@"

From 340ba4588c8073f97b03fd5da9a4fd5dc3b27d2e Mon Sep 17 00:00:00 2001
From: YunQiang Su <syq@debian.org>
Date: Tue, 27 Feb 2024 05:08:58 +0800
Subject: [PATCH 05/46] MIPS: fix emitDirectiveCpsetup on N32 (#80534)

In gas, .cpsetup may expand to one of two code sequences (one is related to `__gnu_local_gp`), depending on -mno-shared and -msym32.
Since Clang doesn't support -mno-shared or -msym32, .cpsetup expands to one code sequence.
The N32 condition incorrectly leads to the incorrect `__gnu_local_gp` code sequence.

```
00000000 <t1>:
   0:   ffbc0008        sd      gp,8(sp)
   4:   3c1c0000        lui     gp,0x0
                        4: R_MIPS_HI16  __gnu_local_gp
   8:   279c0000        addiu   gp,gp,0
                        8: R_MIPS_LO16  __gnu_local_gp
```

Fixes: #52785
(cherry picked from commit 860b6edfa9b344fbf8c500c17158c8212ea87d1c)
---
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  | 12 +++--
 llvm/test/MC/Mips/cpsetup.s                   | 47 ++++++++++++++-----
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 27d7f0f261d10..adfcea7361583 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -1255,7 +1255,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
     emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
   }
 
-  if (getABI().IsN32()) {
+#if 0
+  // We haven't support -mabicalls -mno-shared yet.
+  if (-mno-shared) {
     MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp");
     const MipsMCExpr *HiExpr = MipsMCExpr::create(
         MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()),
@@ -1273,6 +1275,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
 
     return;
   }
+#endif
 
   const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
       MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
@@ -1288,8 +1291,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
   emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(),
           &STI);
 
-  // daddu  $gp, $gp, $funcreg
-  emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
+  // (d)addu  $gp, $gp, $funcreg
+  if (getABI().IsN32())
+    emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
+  else
+    emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI);
 }
 
 void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s
index 8e587aea3e7e6..4a027c6e796ae 100644
--- a/llvm/test/MC/Mips/cpsetup.s
+++ b/llvm/test/MC/Mips/cpsetup.s
@@ -4,8 +4,6 @@
 # RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \
 # RUN:   FileCheck -check-prefixes=ASM,ASM-O32 %s
 
-# FIXME: Now we check .cpsetup expansion for `-mno-shared` case only.
-#        We also need to implement/check the `-mshared` case.
 # RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \
 # RUN:   llvm-objdump --no-print-imm-hex -d -r -z - | \
 # RUN:   FileCheck -check-prefixes=ALL,NXX,N32 %s
@@ -35,11 +33,16 @@ t1:
 
 # NXX-NEXT: sd       $gp, 8($sp)
 # NXX-NEXT: lui      $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
 # NXX-NEXT: addiu    $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu     $gp, $gp, $25
 # N64-NEXT: daddu    $gp, $gp, $25
 
 # ASM-NEXT: .cpsetup $25, 8, __cerror
@@ -64,11 +67,16 @@ t2:
 
 # NXX-NEXT: move     $2, $gp
 # NXX-NEXT: lui      $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
 # NXX-NEXT: addiu    $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu     $gp, $gp, $25
 # N64-NEXT: daddu    $gp, $gp, $25
 
 # ASM-NEXT: .cpsetup $25, $2, __cerror
@@ -101,11 +109,16 @@ t3:
 
 # NXX-NEXT: move     $2, $gp
 # NXX-NEXT: lui      $gp, 0
-# N32-NEXT: {{^ *0+}}38: R_MIPS_HI16 __gnu_local_gp
 # N64-NEXT: {{^ *0+}}40: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 .text
+# N32-NEXT: {{^ *0+}}40: R_MIPS_GPREL16 .text
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
 # NXX-NEXT: addiu    $gp, $gp, 0
-# N32-NEXT: {{^ *0+}}3c: R_MIPS_LO16 __gnu_local_gp
 # N64-NEXT: {{^ *0+}}44: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 .text
+# N32-NEXT: {{^ *0+}}44: R_MIPS_GPREL16 .text
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu     $gp, $gp, $25
 # N64-NEXT: daddu    $gp, $gp, $25
 # NXX-NEXT: nop
 # NXX-NEXT: sub $3, $3, $2
@@ -158,11 +171,16 @@ t5:
 
 # NXX-NEXT: sd       $gp, 8($sp)
 # NXX-NEXT: lui      $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
 # NXX-NEXT: addiu    $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu     $gp, $gp, $25
 # N64-NEXT: daddu    $gp, $gp, $25
 
 # ASM-NEXT: .cpsetup $25, 8, __cerror
@@ -184,11 +202,16 @@ IMM_8 = 8
 
 # NXX-NEXT: sd       $gp, 8($sp)
 # NXX-NEXT: lui      $gp, 0
-# N32-NEXT: R_MIPS_HI16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_HI16
 # NXX-NEXT: addiu    $gp, $gp, 0
-# N32-NEXT: R_MIPS_LO16 __gnu_local_gp
 # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16  __cerror
+# N32-NEXT: R_MIPS_GPREL16 __cerror
+# N32-NEXT: R_MIPS_SUB
+# N32-NEXT: R_MIPS_LO16
+# N32-NEXT: addu     $gp, $gp, $25
 # N64-NEXT: daddu    $gp, $gp, $25
 
 # ASM-NEXT: .cpsetup $25, 8, __cerror

From 267d9b1a74c466fff6bde255a211732aa3bcd7e8 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Wed, 28 Feb 2024 13:03:35 -0800
Subject: [PATCH 06/46] Allow .alt_entry symbols to pass the .cfi nesting check
 (#82268)

A symbol with an `N_ALT_ENTRY` attribute may be defined in the middle of
a subsection, so it is reasonable to opt them out of the
`.cfi_{start,end}proc` nesting check.

Fixes: https://github.com/llvm/llvm-project/issues/82261
(cherry picked from commit 5b91647e3f82c9747c42c3239b7d7f3ade4542a7)
---
 llvm/lib/MC/MCParser/AsmParser.cpp            | 4 +++-
 llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 8e508dbdb1c69..026d252ec5bcd 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -44,6 +44,7 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolMachO.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
@@ -1950,7 +1951,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
       Lex();
     }
 
-    if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc && Sym->isExternal())
+    if (MAI.hasSubsectionsViaSymbols() && CFIStartProcLoc &&
+        Sym->isExternal() && !cast<MCSymbolMachO>(Sym)->isAltEntry())
       return Error(StartTokLoc, "non-private labels cannot appear between "
                                 ".cfi_startproc / .cfi_endproc pairs") &&
              Error(*CFIStartProcLoc, "previous .cfi_startproc was here");
diff --git a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s
index 235b7d4480992..3a5af86defc59 100644
--- a/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s
+++ b/llvm/test/MC/AArch64/cfi-bad-nesting-darwin.s
@@ -8,6 +8,10 @@
 	.p2align	2
 _locomotive:
 	.cfi_startproc
+	; An N_ALT_ENTRY symbol can be defined in the middle of a subsection, so
+	; these are opted out of the .cfi_{start,end}proc nesting check.
+	.alt_entry _engineer
+_engineer:
 	ret
 
 	; It is invalid to have a non-private label between .cfi_startproc and
@@ -17,7 +21,7 @@ _locomotive:
 	.p2align	2
 _caboose:
 ; DARWIN: [[#@LINE-1]]:1: error: non-private labels cannot appear between .cfi_startproc / .cfi_endproc pairs
-; DARWIN: [[#@LINE-10]]:2: error: previous .cfi_startproc was here
+; DARWIN: [[#@LINE-14]]:2: error: previous .cfi_startproc was here
 	ret
 	.cfi_endproc
 

From 16ab0812d2010dad76f87d4d50da8e79e0e75e71 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth@google.com>
Date: Wed, 28 Feb 2024 19:11:55 -0800
Subject: [PATCH 07/46] [clang][fat-lto-objects] Make module flags match
 non-FatLTO pipelines (#83159)

In addition to being rather hard to follow, there isn't a good reason
why FatLTO shouldn't just share the same code for setting module flags
for (Thin)LTO. This patch simplifies the logic and makes sure we set
these flags in a consistent way, independent of FatLTO.

Additionally, we now test that output in the .llvm.lto section actually
matches the output from Full and Thin LTO compilation.

(cherry picked from commit 7d8b50aaab8e0f935e3cb1f3f397e98b9e3ee241)
---
 clang/lib/CodeGen/BackendUtil.cpp    | 32 ++++++++++++++--------------
 clang/test/CodeGen/fat-lto-objects.c | 21 +++++++++++++++++-
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7877e20d77f77..4f22d35f9d3a9 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -186,6 +186,14 @@ class EmitAssemblyHelper {
            TargetTriple.getVendor() != llvm::Triple::Apple;
   }
 
+  /// Check whether we should emit a flag for UnifiedLTO.
+  /// The UnifiedLTO module flag should be set when UnifiedLTO is enabled for
+  /// ThinLTO or Full LTO with module summaries.
+  bool shouldEmitUnifiedLTOModueFlag() const {
+    return CodeGenOpts.UnifiedLTO &&
+           (CodeGenOpts.PrepareForThinLTO || shouldEmitRegularLTOSummary());
+  }
+
 public:
   EmitAssemblyHelper(DiagnosticsEngine &_Diags,
                      const HeaderSearchOptions &HeaderSearchOpts,
@@ -1029,7 +1037,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   if (!actionRequiresCodeGen(Action) && CodeGenOpts.VerifyModule)
     MPM.addPass(VerifierPass());
 
-  if (Action == Backend_EmitBC || Action == Backend_EmitLL) {
+  if (Action == Backend_EmitBC || Action == Backend_EmitLL ||
+      CodeGenOpts.FatLTO) {
     if (CodeGenOpts.PrepareForThinLTO && !CodeGenOpts.DisableLLVMPasses) {
       if (!TheModule->getModuleFlag("EnableSplitLTOUnit"))
         TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit",
@@ -1040,11 +1049,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
           if (!ThinLinkOS)
             return;
         }
-        if (CodeGenOpts.UnifiedLTO)
-          TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1));
         MPM.addPass(ThinLTOBitcodeWriterPass(
             *OS, ThinLinkOS ? &ThinLinkOS->os() : nullptr));
-      } else {
+      } else if (Action == Backend_EmitLL) {
         MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists,
                                     /*EmitLTOSummary=*/true));
       }
@@ -1058,24 +1065,17 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
         if (!TheModule->getModuleFlag("EnableSplitLTOUnit"))
           TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit",
                                    uint32_t(1));
-        if (CodeGenOpts.UnifiedLTO)
-          TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1));
       }
-      if (Action == Backend_EmitBC)
+      if (Action == Backend_EmitBC) {
         MPM.addPass(BitcodeWriterPass(*OS, CodeGenOpts.EmitLLVMUseLists,
                                       EmitLTOSummary));
-      else
+      } else if (Action == Backend_EmitLL) {
         MPM.addPass(PrintModulePass(*OS, "", CodeGenOpts.EmitLLVMUseLists,
                                     EmitLTOSummary));
+      }
     }
-  }
-  if (CodeGenOpts.FatLTO) {
-    // Set the EnableSplitLTOUnit and UnifiedLTO module flags, since FatLTO
-    // uses a different action than Backend_EmitBC or Backend_EmitLL.
-    if (!TheModule->getModuleFlag("EnableSplitLTOUnit"))
-      TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit",
-                               uint32_t(CodeGenOpts.EnableSplitLTOUnit));
-    if (CodeGenOpts.UnifiedLTO && !TheModule->getModuleFlag("UnifiedLTO"))
+
+    if (shouldEmitUnifiedLTOModueFlag())
       TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1));
   }
 
diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c
index afce798c5c819..b50567c024fc8 100644
--- a/clang/test/CodeGen/fat-lto-objects.c
+++ b/clang/test/CodeGen/fat-lto-objects.c
@@ -11,10 +11,11 @@
 // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.split.bc %t.full.split.o
 // RUN: llvm-dis %t.full.split.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED
 
+/// Full LTO always sets EnableSplitLTOUnit when the summary is used.
 // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -emit-obj < %s -o %t.full.nosplit.o
 // RUN: llvm-readelf -S %t.full.nosplit.o | FileCheck %s --check-prefixes=ELF
 // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.nosplit.bc %t.full.nosplit.o
-// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,NOSPLIT,NOUNIFIED
+// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED
 
 // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -fsplit-lto-unit -ffat-lto-objects -emit-obj < %s -o %t.thin.split.o
 // RUN: llvm-readelf -S %t.thin.split.o | FileCheck %s --check-prefixes=ELF
@@ -34,6 +35,21 @@
 // RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -S < %s -o - \
 // RUN: | FileCheck %s --check-prefixes=ASM
 
+/// Make sure that FatLTO generates .llvm.lto sections that are the same as the output from normal LTO compilations
+// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -ffat-lto-objects -c %s -o %t.fatlto.full.o
+// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.full.bc %t.fatlto.full.o
+// RUN: llvm-dis < %t.fatlto.full.bc -o %t.fatlto.full.ll
+// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=full -c %s -o %t.nofat.full.bc
+// RUN: llvm-dis < %t.nofat.full.bc -o %t.nofat.full.ll
+// RUN: diff %t.fatlto.full.ll %t.nofat.full.ll
+
+// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -ffat-lto-objects -c %s -o %t.fatlto.thin.o
+// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.fatlto.thin.bc %t.fatlto.thin.o
+// RUN: llvm-dis < %t.fatlto.thin.bc -o %t.fatlto.thin.ll
+// RUN: %clang -O2 --target=x86_64-unknown-linux-gnu -fPIE -flto=thin -c %s -o %t.nofat.thin.bc
+// RUN: llvm-dis < %t.nofat.thin.bc -o %t.nofat.thin.ll
+// RUN: diff %t.fatlto.thin.ll %t.nofat.thin.ll
+
 /// Be sure we enable split LTO units correctly under -ffat-lto-objects.
 //   SPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 1}
 // NOSPLIT: ![[#]] = !{i32 1, !"EnableSplitLTOUnit", i32 0}
@@ -51,6 +67,9 @@
 // ASM-NEXT:        .asciz  "BC
 // ASM-NEXT: .size   .Lllvm.embedded.object
 
+const char* foo = "foo";
+
 int test(void) {
+  const char* bar = "bar";
   return 0xabcd;
 }

From bf45c3a07918c14577ef7a829f16ec339b9ed610 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 2 Mar 2024 12:34:36 +0000
Subject: [PATCH 08/46] [DSE] Delay deleting non-memory-defs until end of DSE.
 (#83411)

DSE uses BatchAA, which caches queries using pairs of MemoryLocations.
At the moment, DSE may remove instructions that are used as pointers in
cached MemoryLocations. If a new instruction is used by a new MemoryLocation
and this instruction gets allocated at the same address as a previously
cached and then removed instruction, we may access an incorrect entry in
the cache.

To avoid this, delay removing all instructions except MemoryDefs until
the end of DSE. This should avoid removing any values used in BatchAA's
cache.

Test case by @vporpo from
https://github.com/llvm/llvm-project/pull/83181.
(Test not precommitted because the results are non-deterministic - memset
only sometimes gets removed)

PR: https://github.com/llvm/llvm-project/pull/83411
(cherry picked from commit 10f5e983a9e3162a569cbebeb32168716e391340)
---
 .../Scalar/DeadStoreElimination.cpp           |  31 ++-
 .../batchaa-caching-new-pointers.ll           | 189 ++++++++++++++++++
 2 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll

diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 11a91bfbe5baf..340fba4fb9c5a 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -857,6 +857,9 @@ struct DSEState {
   // no longer be captured.
   bool ShouldIterateEndOfFunctionDSE;
 
+  /// Dead instructions to be removed at the end of DSE.
+  SmallVector<Instruction *> ToRemove;
+
   // Class contains self-reference, make sure it's not copied/moved.
   DSEState(const DSEState &) = delete;
   DSEState &operator=(const DSEState &) = delete;
@@ -1692,7 +1695,8 @@ struct DSEState {
     return {MaybeDeadAccess};
   }
 
-  // Delete dead memory defs
+  /// Delete dead memory defs and recursively add their operands to ToRemove if
+  /// they became dead.
   void deleteDeadInstruction(Instruction *SI) {
     MemorySSAUpdater Updater(&MSSA);
     SmallVector<Instruction *, 32> NowDeadInsts;
@@ -1708,8 +1712,11 @@ struct DSEState {
       salvageKnowledge(DeadInst);
 
       // Remove the Instruction from MSSA.
-      if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
-        if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+      MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst);
+      bool IsMemDef = MA && isa<MemoryDef>(MA);
+      if (MA) {
+        if (IsMemDef) {
+          auto *MD = cast<MemoryDef>(MA);
           SkipStores.insert(MD);
           if (auto *SI = dyn_cast<StoreInst>(MD->getMemoryInst())) {
             if (SI->getValueOperand()->getType()->isPointerTy()) {
@@ -1730,13 +1737,21 @@ struct DSEState {
       // Remove its operands
       for (Use &O : DeadInst->operands())
         if (Instruction *OpI = dyn_cast<Instruction>(O)) {
-          O = nullptr;
+          O.set(PoisonValue::get(O->getType()));
           if (isInstructionTriviallyDead(OpI, &TLI))
             NowDeadInsts.push_back(OpI);
         }
 
       EI.removeInstruction(DeadInst);
-      DeadInst->eraseFromParent();
+      // Remove memory defs directly if they don't produce results, but only
+      // queue other dead instructions for later removal. They may have been
+      // used as memory locations that have been cached by BatchAA. Removing
+      // them here may lead to newly created instructions to be allocated at the
+      // same address, yielding stale cache entries.
+      if (IsMemDef && DeadInst->getType()->isVoidTy())
+        DeadInst->eraseFromParent();
+      else
+        ToRemove.push_back(DeadInst);
     }
   }
 
@@ -2233,6 +2248,12 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
 
   MadeChange |= State.eliminateRedundantStoresOfExistingValues();
   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+
+  while (!State.ToRemove.empty()) {
+    Instruction *DeadInst = State.ToRemove.pop_back_val();
+    DeadInst->eraseFromParent();
+  }
+
   return MadeChange;
 }
 } // end anonymous namespace
diff --git a/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll
new file mode 100644
index 0000000000000..ee9bd6912e2ae
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/batchaa-caching-new-pointers.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=dse < %s | FileCheck %s
+;
+; DSE kills `store i32 44, ptr %struct.byte.4, align 4` but should not kill
+; `call void @llvm.memset.p0.i64(...)`  because it has a clobber read:
+; `%ret = load ptr, ptr %struct.byte.8`
+
+
+%struct.type = type { ptr, ptr }
+
+define ptr @foo(ptr noundef %ptr) {
+; CHECK-LABEL: define ptr @foo(
+; CHECK-SAME: ptr noundef [[PTR:%.*]]) {
+; CHECK-NEXT:    [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false)
+; CHECK-NEXT:    store i32 43, ptr [[STRUCT_BYTE_8]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    ret ptr [[RET]]
+;
+  %struct.alloca = alloca %struct.type, align 8
+  call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8
+  ; Set %struct.alloca[8, 16) to 42.
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false)
+  ; Set %struct.alloca[8, 12) to 43.
+  store i32 43, ptr %struct.byte.8, align 4
+  ; Set %struct.alloca[4, 8) to 44.
+  %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4
+  store i32 44, ptr %struct.byte.4, align 4
+  ; Return %struct.alloca[8, 16).
+  %ret = load ptr, ptr %struct.byte.8
+  call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  ret ptr %ret
+}
+
+; Set of tests based on @foo, but where the memset's operands cannot be erased
+; due to other uses. Instead, they contain a number of removable MemoryDefs;
+; with non-void types result types.
+
+define ptr @foo_with_removable_malloc() {
+; CHECK-LABEL: define ptr @foo_with_removable_malloc() {
+; CHECK-NEXT:    [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4
+; CHECK-NEXT:    [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false)
+; CHECK-NEXT:    store i32 43, ptr [[STRUCT_BYTE_8]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_4]])
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_8]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    ret ptr [[RET]]
+;
+  %struct.alloca = alloca %struct.type, align 8
+  call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4
+  %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8
+
+  ; Set of removable memory deffs
+  %m2 = tail call ptr @malloc(i64 4)
+  %m1 = tail call ptr @malloc(i64 4)
+  store i32 0, ptr %struct.byte.8
+  store i32 0, ptr %struct.byte.8
+  store i32 123, ptr %m1
+  store i32 123, ptr %m2
+
+  ; Set %struct.alloca[8, 16) to 42.
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false)
+  ; Set %struct.alloca[8, 12) to 43.
+  store i32 43, ptr %struct.byte.8, align 4
+  ; Set %struct.alloca[4, 8) to 44.
+  store i32 44, ptr %struct.byte.4, align 4
+  ; Return %struct.alloca[8, 16).
+  %ret = load ptr, ptr %struct.byte.8
+  call void @readnone(ptr %struct.byte.4);
+  call void @readnone(ptr %struct.byte.8);
+  call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  ret ptr %ret
+}
+
+define ptr @foo_with_removable_malloc_free() {
+; CHECK-LABEL: define ptr @foo_with_removable_malloc_free() {
+; CHECK-NEXT:    [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8
+; CHECK-NEXT:    [[M1:%.*]] = tail call ptr @malloc(i64 4)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4
+; CHECK-NEXT:    [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8
+; CHECK-NEXT:    [[M2:%.*]] = tail call ptr @malloc(i64 4)
+; CHECK-NEXT:    call void @free(ptr [[M1]])
+; CHECK-NEXT:    call void @free(ptr [[M2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false)
+; CHECK-NEXT:    store i32 43, ptr [[STRUCT_BYTE_8]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_4]])
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_8]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    ret ptr [[RET]]
+;
+  %struct.alloca = alloca %struct.type, align 8
+  %m1 = tail call ptr @malloc(i64 4)
+  call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4
+  %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8
+
+  store i32 0, ptr %struct.byte.4
+  store i32 0, ptr %struct.byte.8
+  %m2 = tail call ptr @malloc(i64 4)
+  store i32 123, ptr %m1
+  call void @free(ptr %m1);
+  store i32 123, ptr %m2
+  call void @free(ptr %m2);
+
+  ; Set %struct.alloca[8, 16) to 42.
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false)
+  ; Set %struct.alloca[8, 12) to 43.
+  store i32 43, ptr %struct.byte.8, align 4
+  ; Set %struct.alloca[4, 8) to 44.
+  store i32 44, ptr %struct.byte.4, align 4
+  ; Return %struct.alloca[8, 16).
+  %ret = load ptr, ptr %struct.byte.8
+  call void @readnone(ptr %struct.byte.4);
+  call void @readnone(ptr %struct.byte.8);
+  call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  ret ptr %ret
+}
+
+define ptr @foo_with_malloc_to_calloc() {
+; CHECK-LABEL: define ptr @foo_with_malloc_to_calloc() {
+; CHECK-NEXT:    [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8
+; CHECK-NEXT:    [[STRUCT_BYTE_4:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 4
+; CHECK-NEXT:    [[CALLOC1:%.*]] = call ptr @calloc(i64 1, i64 4)
+; CHECK-NEXT:    [[CALLOC:%.*]] = call ptr @calloc(i64 1, i64 4)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_BYTE_8]], i64 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 [[TMP1]], i8 42, i64 4, i1 false)
+; CHECK-NEXT:    store i32 43, ptr [[STRUCT_BYTE_8]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_4]])
+; CHECK-NEXT:    call void @readnone(ptr [[STRUCT_BYTE_8]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR6]]
+; CHECK-NEXT:    call void @use(ptr [[CALLOC1]])
+; CHECK-NEXT:    call void @use(ptr [[CALLOC]])
+; CHECK-NEXT:    ret ptr [[RET]]
+;
+  %struct.alloca = alloca %struct.type, align 8
+  call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8
+  %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4
+
+  ; Set of removable memory deffs
+  %m1 = tail call ptr @malloc(i64 4)
+  %m2 = tail call ptr @malloc(i64 4)
+  store i32 0, ptr %struct.byte.4
+  store i32 0, ptr %struct.byte.8
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m2, i8 0, i64 4, i1 false)
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %m1, i8 0, i64 4, i1 false)
+
+  ; Set %struct.alloca[8, 16) to 42.
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false)
+  ; Set %struct.alloca[8, 12) to 43.
+  store i32 43, ptr %struct.byte.8, align 4
+  ; Set %struct.alloca[4, 8) to 44.
+  store i32 44, ptr %struct.byte.4, align 4
+  ; Return %struct.alloca[8, 16).
+  %ret = load ptr, ptr %struct.byte.8
+  call void @readnone(ptr %struct.byte.4);
+  call void @readnone(ptr %struct.byte.8);
+  call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind
+  call void @use(ptr %m1)
+  call void @use(ptr %m2)
+  ret ptr %ret
+}
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+
+declare noalias ptr @malloc(i64) willreturn allockind("alloc,uninitialized") "alloc-family"="malloc"
+declare void @readnone(ptr) readnone nounwind
+declare void @free(ptr nocapture) allockind("free") "alloc-family"="malloc"
+
+declare void @use(ptr)

From e90bfdb4ddced8dff215672ffeceece8ebe60426 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 6 Mar 2024 13:17:43 -0800
Subject: [PATCH 09/46] [test] Make two sanitize-coverage tests pass with glibc
 2.39+

glibc 2.39 added `nonnull` attribute to most libio functions accepting a
`FILE*` parameter, including fprintf[1]. The -fsanitize=undefined mode
checks the argument to fprintf and has extra counters, not expected by
two tests. Specify -fno-sanitize=nonnull-attribute to make the two tests
pass.

Fix #82883

[1]: https://sourceware.org/git/?p=glibc.git;a=commit;h=64b1a44183a3094672ed304532bedb9acc707554

Pull Request: https://github.com/llvm/llvm-project/pull/84231

(cherry picked from commit c3acbf6bb06f9039f9850e18e0ae2f2adef63905)
---
 .../sanitizer_coverage_inline8bit_counter_default_impl.cpp    | 4 +++-
 .../TestCases/sanitizer_coverage_symbolize.cpp                | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp
index 1ac04b53491e1..1d1fbf7299e8b 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_inline8bit_counter_default_impl.cpp
@@ -3,7 +3,9 @@
 
 // REQUIRES: has_sancovcc,stable-runtime,linux,x86_64-target-arch
 
-// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -o %t
+/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute,
+/// which would increase counters for ubsan.
+// RUN: %clangxx -O0 %s -fsanitize-coverage=inline-8bit-counters,pc-table -fno-sanitize=nonnull-attribute -o %t
 // RUN: rm -f %t-counters %t-pcs
 // RUN: env %tool_options="cov_8bit_counters_out=%t-counters cov_pcs_out=%t-pcs verbosity=1" %run %t 2>&1 | FileCheck %s
 
diff --git a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp
index daa994c811625..b168954a1c92c 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/sanitizer_coverage_symbolize.cpp
@@ -7,7 +7,9 @@
 // RUN: rm -rf $DIR
 // RUN: mkdir -p $DIR
 // RUN: cd $DIR
-// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard %s -o %t
+/// In glibc 2.39+, fprintf has a nonnull attribute. Disable nonnull-attribute,
+/// which would increase counters for ubsan.
+// RUN: %clangxx -O0 -fsanitize-coverage=trace-pc-guard -fno-sanitize=nonnull-attribute %s -o %t
 // RUN: %env_tool_opts=coverage=1 %t 2>&1 | FileCheck %s
 // RUN: rm -rf $DIR
 

From 4c36ecbe0e162155e6032aec75323dcdd7c81b90 Mon Sep 17 00:00:00 2001
From: Quentin Dian <dianqk@dianqk.net>
Date: Wed, 6 Mar 2024 06:16:28 +0800
Subject: [PATCH 10/46]  [InstCombine] Fix shift calculation in
 InstCombineCasts (#84027)

Fixes #84025.

(cherry picked from commit e96c0c1d5e0a9916098b1a31acb006ea6c1108fb)
---
 .../Transforms/InstCombine/InstCombineCasts.cpp   |  4 ++--
 llvm/test/Transforms/InstCombine/bitcast.ll       | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 58f0763bb0c0c..c5d3f60176a82 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2156,14 +2156,14 @@ static bool collectInsertionElements(Value *V, unsigned Shift,
     Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
 
     for (unsigned i = 0; i != NumElts; ++i) {
-      unsigned ShiftI = Shift + i * ElementSize;
+      unsigned ShiftI = i * ElementSize;
       Constant *Piece = ConstantFoldBinaryInstruction(
           Instruction::LShr, C, ConstantInt::get(C->getType(), ShiftI));
       if (!Piece)
         return false;
 
       Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
-      if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
+      if (!collectInsertionElements(Piece, ShiftI + Shift, Elements, VecEltTy,
                                     isBigEndian))
         return false;
     }
diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll
index 58bd81297b0dd..5ace1039c3782 100644
--- a/llvm/test/Transforms/InstCombine/bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast.ll
@@ -686,6 +686,21 @@ define ptr @bitcast_from_single_element_pointer_vector_to_pointer(<1 x ptr> %ptr
   ret ptr %ptr
 }
 
+; Ensure that we calculate the correct shift.
+define <4 x i32> @bitcast_shl(i32 %arg) {
+; CHECK-LABEL: @bitcast_shl(
+; CHECK-NEXT:    [[I5:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 65, i32 poison>, i32 [[ARG:%.*]], i64 3
+; CHECK-NEXT:    ret <4 x i32> [[I5]]
+;
+  %i = zext i32 %arg to i64
+  %i1 = shl i64 %i, 32
+  %i2 = or i64 %i1, 65
+  %i3 = zext i64 %i2 to i128
+  %i4 = shl i128 %i3, 64
+  %i5 = bitcast i128 %i4 to <4 x i32>
+  ret <4 x i32> %i5
+}
+
 declare void @f1()
 declare void @f2()
 define ptr @select_bitcast_unsized_pointer(i1 %c) {

From 94d8f150ed8b6afb83ba47f68116a0247a23cb52 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 6 Mar 2024 09:33:51 +0100
Subject: [PATCH 11/46] [InstCombine] Fix infinite loop in select equivalence
 fold (#84036)

When replacing with a non-constant, it's possible that the result of the
simplification is actually more complicated than the original, and may
result in an infinite combine loop.

Mitigate the issue by requiring that either the replacement or
simplification result is constant, which should ensure that it's
simpler. While this check is crude, it does not appear to cause
optimization regressions in real-world code in practice.

Fixes https://github.com/llvm/llvm-project/issues/83127.

(cherry picked from commit 9f45c5e1a65a1abf4920b617d36ed05e73c04bea)
---
 .../InstCombine/InstCombineSelect.cpp         |  9 ++++-
 llvm/test/Transforms/InstCombine/select.ll    | 38 ++++++++++++++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 21bfc91148bfe..9f220ec003ec3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1284,7 +1284,11 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
       isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) {
     if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ,
                                           /* AllowRefinement */ true))
-      return replaceOperand(Sel, Swapped ? 2 : 1, V);
+      // Require either the replacement or the simplification result to be a
+      // constant to avoid infinite loops.
+      // FIXME: Make this check more precise.
+      if (isa<Constant>(CmpRHS) || isa<Constant>(V))
+        return replaceOperand(Sel, Swapped ? 2 : 1, V);
 
     // Even if TrueVal does not simplify, we can directly replace a use of
     // CmpLHS with CmpRHS, as long as the instruction is not used anywhere
@@ -1302,7 +1306,8 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
       isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT))
     if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ,
                                           /* AllowRefinement */ true))
-      return replaceOperand(Sel, Swapped ? 2 : 1, V);
+      if (isa<Constant>(CmpLHS) || isa<Constant>(V))
+        return replaceOperand(Sel, Swapped ? 2 : 1, V);
 
   auto *FalseInst = dyn_cast<Instruction>(FalseVal);
   if (!FalseInst)
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index c5f1b77c6d740..b7e743c14a52c 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -2849,12 +2849,14 @@ define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) {
   ret i8 %sel
 }
 
+; FIXME: This is safe to fold.
 define i8 @select_replacement_shift_noundef(i8 %x, i8 %y, i8 %z) {
 ; CHECK-LABEL: @select_replacement_shift_noundef(
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i8 [[X:%.*]], 1
 ; CHECK-NEXT:    call void @use_i8(i8 noundef [[SHR]])
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[SHR]], [[Y:%.*]]
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[X]], i8 [[Z:%.*]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i8 [[Y]], 1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[SHL]], i8 [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[SEL]]
 ;
   %shr = lshr exact i8 %x, 1
@@ -2904,6 +2906,40 @@ define i32 @select_replacement_loop2(i32 %arg, i32 %arg2) {
   ret i32 %sel
 }
 
+define i8 @select_replacement_loop3(i32 noundef %x) {
+; CHECK-LABEL: @select_replacement_loop3(
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[X:%.*]] to i8
+; CHECK-NEXT:    [[REV:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[TRUNC]])
+; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[REV]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[EXT]], [[X]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i8 [[TRUNC]], i8 0
+; CHECK-NEXT:    ret i8 [[SEL]]
+;
+  %trunc = trunc i32 %x to i8
+  %rev = call i8 @llvm.bitreverse.i8(i8 %trunc)
+  %ext = zext i8 %rev to i32
+  %cmp = icmp eq i32 %ext, %x
+  %sel = select i1 %cmp, i8 %trunc, i8 0
+  ret i8 %sel
+}
+
+define i16 @select_replacement_loop4(i16 noundef %p_12) {
+; CHECK-LABEL: @select_replacement_loop4(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i16 [[P_12:%.*]], 2
+; CHECK-NEXT:    [[AND1:%.*]] = and i16 [[P_12]], 1
+; CHECK-NEXT:    [[AND2:%.*]] = select i1 [[CMP1]], i16 [[AND1]], i16 0
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i16 [[AND2]], [[P_12]]
+; CHECK-NEXT:    [[AND3:%.*]] = select i1 [[CMP2]], i16 [[AND1]], i16 0
+; CHECK-NEXT:    ret i16 [[AND3]]
+;
+  %cmp1 = icmp ult i16 %p_12, 2
+  %and1 = and i16 %p_12, 1
+  %and2 = select i1 %cmp1, i16 %and1, i16 0
+  %cmp2 = icmp eq i16 %and2, %p_12
+  %and3 = select i1 %cmp2, i16 %and1, i16 0
+  ret i16 %and3
+}
+
 define ptr @select_replacement_gep_inbounds(ptr %base, i64 %offset) {
 ; CHECK-LABEL: @select_replacement_gep_inbounds(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]]

From 7cb67530d2e9b7364e81f0d35da99a2855e391ac Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 11 Mar 2024 13:40:01 -0700
Subject: [PATCH 12/46] ReleaseNotes for LLVM binary utilities (#83751)

---
 llvm/docs/ReleaseNotes.rst | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 5b3210138f2f8..bfa8e93da05cb 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -339,36 +339,41 @@ Changes to the Debug Info
 Changes to the LLVM tools
 ---------------------------------
 
-* llvm-symbolizer now treats invalid input as an address for which source
+* ``llvm-symbolizer`` now treats invalid input as an address for which source
   information is not found.
-* Fixed big-endian support in llvm-symbolizer's DWARF location parser.
-* llvm-readelf now supports ``--extra-sym-info`` (``-X``) to display extra
+* Fixed big-endian support in ``llvm-symbolizer``'s DWARF location parser.
+* ``llvm-readelf`` now supports ``--extra-sym-info`` (``-X``) to display extra
   information (section name) when showing symbols.
+* ``llvm-readobj``/``llvm-readelf`` now supports ``--decompress``/``-z`` with
+  string and hex dump for ELF object files.
 
-* ``llvm-nm`` now supports the ``--line-numbers`` (``-l``) option to use
-  debugging information to print symbols' filenames and line numbers.
-
-* llvm-symbolizer and llvm-addr2line now support addresses specified as symbol names.
+* ``llvm-symbolizer`` and ``llvm-addr2line`` now support addresses specified as symbol names.
 
-* llvm-objcopy now supports ``--gap-fill`` and ``--pad-to`` options, for
+* ``llvm-objcopy`` now supports ``--gap-fill`` and ``--pad-to`` options, for
   ELF input and binary output files only.
+* ``llvm-objcopy`` now supports ``-O elf64-s390`` for SystemZ.
 
-* Supported parsing XCOFF auxiliary symbols in obj2yaml.
+* Supported parsing XCOFF auxiliary symbols in ``obj2yaml``.
 
 * ``llvm-ranlib`` now supports ``-X`` on AIX to specify the type of object file
   ranlib should examine.
 
+* ``llvm-cxxfilt`` now supports ``--no-params``/``-p`` to skip function
+  parameters.
+
 * ``llvm-nm`` now supports ``--export-symbol`` to ignore the import symbol file.
+* ``llvm-nm`` now supports the ``--line-numbers`` (``-l``) option to use
+  debugging information to print symbols' filenames and line numbers.
 
-* llvm-rc and llvm-windres now accept file path references in ``.rc`` files
+* ``llvm-rc`` and ``llvm-windres`` now accept file path references in ``.rc`` files
   concatenated from multiple string literals.
 
-* The llvm-windres option ``--preprocessor`` now resolves its argument
-  in the PATH environment variable as expected, and options passed with
+* The ``llvm-windres`` option ``--preprocessor`` now resolves its argument
+  in the ``PATH`` environment variable as expected, and options passed with
   ``--preprocessor-arg`` are placed before the input file as they should
   be.
 
-* The llvm-windres option ``--preprocessor`` has been updated with the
+* The ``llvm-windres`` option ``--preprocessor`` has been updated with the
   breaking behaviour change from GNU windres from binutils 2.36, where
   the whole argument is considered as one path, not considered as a
   sequence of tool name and parameters.

From a91b9bd9c7507b378a7f318db52484c8bacc12eb Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <vadim.paretsky@intel.com>
Date: Sat, 9 Mar 2024 10:47:31 -0800
Subject: [PATCH 13/46] [OpenMP] fix endianness dependent definitions in OMP
 headers for MSVC (#84540)

MSVC does not define __BYTE_ORDER__ (nor __ORDER_BIG_ENDIAN__), so both
identifiers evaluate to 0 in the preprocessor and the BigEndian check
erroneously evaluates to true, selecting the wrong struct layouts in
MSVC-compiled builds. The fix adds a check that __BYTE_ORDER__ is actually
defined by the compiler before comparing it.

---------

Co-authored-by: Vadim Paretsky <b-vadipa@microsoft.com>
(cherry picked from commit 110141b37813dc48af33de5e1407231e56acdfc5)
---
 openmp/runtime/src/kmp.h                                 | 4 ++--
 openmp/runtime/src/kmp_lock.h                            | 3 ++-
 openmp/runtime/test/tasking/bug_nested_proxy_task.c      | 2 +-
 openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c | 2 +-
 openmp/runtime/test/tasking/hidden_helper_task/common.h  | 2 +-
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 259c57b5afbca..e3a1e20731bbe 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2506,7 +2506,7 @@ typedef struct kmp_depend_info {
   union {
     kmp_uint8 flag; // flag as an unsigned char
     struct { // flag as a set of 8 bits
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       /* Same fields as in the #else branch, but in reverse order */
       unsigned all : 1;
       unsigned unused : 3;
@@ -2671,7 +2671,7 @@ typedef struct kmp_task_stack {
 #endif // BUILD_TIED_TASK_STACK
 
 typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
   /* Same fields as in the #else branch, but in reverse order */
 #if OMPX_TASKGRAPH
   unsigned reserved31 : 6;
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index e2a0cda01a971..6202f3d617cc5 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -120,7 +120,8 @@ extern void __kmp_validate_locks(void);
 
 struct kmp_base_tas_lock {
   // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ && __LP64__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) &&     \
+    __LP64__
   // Flip the ordering of the high and low 32-bit member to be consistent
   // with the memory layout of the address in 64-bit big-endian.
   kmp_int32 depth_locked; // depth locked, for nested locks only
diff --git a/openmp/runtime/test/tasking/bug_nested_proxy_task.c b/openmp/runtime/test/tasking/bug_nested_proxy_task.c
index 24fe1f3fe7607..9e0b412efce60 100644
--- a/openmp/runtime/test/tasking/bug_nested_proxy_task.c
+++ b/openmp/runtime/test/tasking/bug_nested_proxy_task.c
@@ -50,7 +50,7 @@ typedef struct kmp_depend_info {
      union {
         kmp_uint8 flag; // flag as an unsigned char
         struct { // flag as a set of 8 bits
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
           unsigned all : 1;
           unsigned unused : 3;
           unsigned set : 1;
diff --git a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c
index 688860c035728..1e86d574f4f6a 100644
--- a/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c
+++ b/openmp/runtime/test/tasking/bug_proxy_task_dep_waiting.c
@@ -47,7 +47,7 @@ typedef struct kmp_depend_info {
      union {
         kmp_uint8 flag; // flag as an unsigned char
         struct { // flag as a set of 8 bits
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
           unsigned all : 1;
           unsigned unused : 3;
           unsigned set : 1;
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/common.h b/openmp/runtime/test/tasking/hidden_helper_task/common.h
index ba57656cbac41..68e2b584c8773 100644
--- a/openmp/runtime/test/tasking/hidden_helper_task/common.h
+++ b/openmp/runtime/test/tasking/hidden_helper_task/common.h
@@ -17,7 +17,7 @@ typedef struct kmp_depend_info {
   union {
     unsigned char flag;
     struct {
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       unsigned all : 1;
       unsigned unused : 3;
       unsigned set : 1;

From 69d9b15fe872bb188474b0ad9e36c8506bdf9cc3 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp@bytedance.com>
Date: Tue, 5 Mar 2024 19:54:02 +0800
Subject: [PATCH 14/46] [TableGen] Fix wrong codegen of
 BothFusionPredicateWithMCInstPredicate (#83990)

We should generate the `MCInstPredicate` twice: once with `FirstMI`
and once with `SecondMI`.

(cherry picked from commit de1f33873beff93063577195e1214a9509e229e0)
---
 llvm/include/llvm/Target/TargetSchedule.td    |  2 +-
 llvm/test/TableGen/MacroFusion.td             | 25 +++++++++++++++++++
 .../TableGen/MacroFusionPredicatorEmitter.cpp | 12 ++++-----
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td
index 032de72851782..40c2cce8c6eff 100644
--- a/llvm/include/llvm/Target/TargetSchedule.td
+++ b/llvm/include/llvm/Target/TargetSchedule.td
@@ -620,7 +620,7 @@ class SecondFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
   : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
 // The pred will be applied on both firstMI and secondMI.
 class BothFusionPredicateWithMCInstPredicate<MCInstPredicate pred>
-  : FusionPredicateWithMCInstPredicate<second_fusion_target, pred>;
+  : FusionPredicateWithMCInstPredicate<both_fusion_target, pred>;
 
 // Tie firstOpIdx and secondOpIdx. The operand of `FirstMI` at position
 // `firstOpIdx` should be the same as the operand of `SecondMI` at position
diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td
index 4aa6c8d9acb27..ce76e7f0f7fa6 100644
--- a/llvm/test/TableGen/MacroFusion.td
+++ b/llvm/test/TableGen/MacroFusion.td
@@ -34,6 +34,11 @@ let Namespace = "Test" in {
 def Inst0 : TestInst<0>;
 def Inst1 : TestInst<1>;
 
+def BothFusionPredicate: BothFusionPredicateWithMCInstPredicate<CheckRegOperand<0, X0>>;
+def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusionPredicate",
+                                    "Test BothFusionPredicate",
+                                    [BothFusionPredicate]>;
+
 def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
                              CheckOpcode<[Inst0]>,
                              CheckAll<[
@@ -45,6 +50,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:  #undef GET_Test_MACRO_FUSION_PRED_DECL
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  namespace llvm {
+// CHECK-PREDICATOR-NEXT:  bool isTestBothFusionPredicate(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
 // CHECK-PREDICATOR-NEXT:  bool isTestFusion(const TargetInstrInfo &, const TargetSubtargetInfo &, const MachineInstr *, const MachineInstr &);
 // CHECK-PREDICATOR-NEXT:  } // end namespace llvm
 // CHECK-PREDICATOR-EMPTY:
@@ -54,6 +60,24 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 // CHECK-PREDICATOR-NEXT:  #undef GET_Test_MACRO_FUSION_PRED_IMPL
 // CHECK-PREDICATOR-EMPTY:
 // CHECK-PREDICATOR-NEXT:  namespace llvm {
+// CHECK-PREDICATOR-NEXT:  bool isTestBothFusionPredicate(
+// CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
+// CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *FirstMI,
+// CHECK-PREDICATOR-NEXT:      const MachineInstr &SecondMI) {
+// CHECK-PREDICATOR-NEXT:    auto &MRI = SecondMI.getMF()->getRegInfo();
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = FirstMI;
+// CHECK-PREDICATOR-NEXT:      if (MI->getOperand(0).getReg() != Test::X0)
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    {
+// CHECK-PREDICATOR-NEXT:      const MachineInstr *MI = &SecondMI;
+// CHECK-PREDICATOR-NEXT:      if (MI->getOperand(0).getReg() != Test::X0)
+// CHECK-PREDICATOR-NEXT:        return false;
+// CHECK-PREDICATOR-NEXT:    }
+// CHECK-PREDICATOR-NEXT:    return true;
+// CHECK-PREDICATOR-NEXT:  }
 // CHECK-PREDICATOR-NEXT:  bool isTestFusion(
 // CHECK-PREDICATOR-NEXT:      const TargetInstrInfo &TII,
 // CHECK-PREDICATOR-NEXT:      const TargetSubtargetInfo &STI,
@@ -106,6 +130,7 @@ def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion",
 
 // CHECK-SUBTARGET:      std::vector<MacroFusionPredTy> TestGenSubtargetInfo::getMacroFusions() const {
 // CHECK-SUBTARGET-NEXT:   std::vector<MacroFusionPredTy> Fusions;
+// CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestBothFusionPredicate)) Fusions.push_back(llvm::isTestBothFusionPredicate); 
 // CHECK-SUBTARGET-NEXT:   if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion);
 // CHECK-SUBTARGET-NEXT:   return Fusions;
 // CHECK-SUBTARGET-NEXT: }
diff --git a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
index 78dcd4471ae74..7f494e532b1f4 100644
--- a/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
+++ b/llvm/utils/TableGen/MacroFusionPredicatorEmitter.cpp
@@ -152,8 +152,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
         << "if (FirstDest.isVirtual() && !MRI.hasOneNonDBGUse(FirstDest))\n";
     OS.indent(4) << "  return false;\n";
     OS.indent(2) << "}\n";
-  } else if (Predicate->isSubClassOf(
-                 "FirstFusionPredicateWithMCInstPredicate")) {
+  } else if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) {
     OS.indent(2) << "{\n";
     OS.indent(4) << "const MachineInstr *MI = FirstMI;\n";
     OS.indent(4) << "if (";
@@ -173,7 +172,7 @@ void MacroFusionPredicatorEmitter::emitFirstPredicate(Record *Predicate,
 void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
                                                        PredicateExpander &PE,
                                                        raw_ostream &OS) {
-  if (Predicate->isSubClassOf("SecondFusionPredicateWithMCInstPredicate")) {
+  if (Predicate->isSubClassOf("FusionPredicateWithMCInstPredicate")) {
     OS.indent(2) << "{\n";
     OS.indent(4) << "const MachineInstr *MI = &SecondMI;\n";
     OS.indent(4) << "if (";
@@ -185,7 +184,7 @@ void MacroFusionPredicatorEmitter::emitSecondPredicate(Record *Predicate,
     OS.indent(2) << "}\n";
   } else {
     PrintFatalError(Predicate->getLoc(),
-                    "Unsupported predicate for first instruction: " +
+                    "Unsupported predicate for second instruction: " +
                         Predicate->getType()->getAsString());
   }
 }
@@ -196,9 +195,8 @@ void MacroFusionPredicatorEmitter::emitBothPredicate(Record *Predicate,
   if (Predicate->isSubClassOf("FusionPredicateWithCode"))
     OS << Predicate->getValueAsString("Predicate");
   else if (Predicate->isSubClassOf("BothFusionPredicateWithMCInstPredicate")) {
-    Record *MCPred = Predicate->getValueAsDef("Predicate");
-    emitFirstPredicate(MCPred, PE, OS);
-    emitSecondPredicate(MCPred, PE, OS);
+    emitFirstPredicate(Predicate, PE, OS);
+    emitSecondPredicate(Predicate, PE, OS);
   } else if (Predicate->isSubClassOf("TieReg")) {
     int FirstOpIdx = Predicate->getValueAsInt("FirstOpIdx");
     int SecondOpIdx = Predicate->getValueAsInt("SecondOpIdx");

From ea6c457b8dd2d0e6a7f05b4a5bdd2686085e1ec0 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Mon, 4 Mar 2024 08:38:52 +0800
Subject: [PATCH 15/46] [LoongArch] Override
 LoongArchTargetLowering::getExtendForAtomicCmpSwapArg (#83656)

This patch aims to solve Firefox issue:
https://bugzilla.mozilla.org/show_bug.cgi?id=1882301

Similar to 616289ed2922. Currently LoongArch uses an ll.[wd]/sc.[wd]
loop for ATOMIC_CMP_XCHG. Because the comparison in the loop is
full-width (i.e. the `bne` instruction), we must sign extend the input
comparison argument.

Note that LoongArch ISA manual V1.1 has introduced compare-and-swap
instructions. We will change the implementation (to return `ANY_EXTEND`)
once we support them.

(cherry picked from commit 5f058aa211995d2f0df2a0e063532832569cb7a8)
---
 .../LoongArch/LoongArchISelLowering.cpp       |   5 +
 .../Target/LoongArch/LoongArchISelLowering.h  |   2 +
 .../LoongArch/atomicrmw-uinc-udec-wrap.ll     | 120 +++++++------
 .../ir-instruction/atomic-cmpxchg.ll          |  25 +--
 .../LoongArch/ir-instruction/atomicrmw-fp.ll  | 160 +++++++++---------
 5 files changed, 159 insertions(+), 153 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 76c1a14fe0156..b161c5434ca13 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -4940,3 +4940,8 @@ bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const {
 
   return !isa<ConstantSDNode>(Y);
 }
+
+ISD::NodeType LoongArchTargetLowering::getExtendForAtomicCmpSwapArg() const {
+  // TODO: LAMCAS will use amcas{_DB,}.[bhwd] which does not require extension.
+  return ISD::SIGN_EXTEND;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 72182623b2c3d..9e9ac0b826929 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -206,6 +206,8 @@ class LoongArchTargetLowering : public TargetLowering {
     return ISD::SIGN_EXTEND;
   }
 
+  ISD::NodeType getExtendForAtomicCmpSwapArg() const override;
+
   Register getRegisterByName(const char *RegName, LLT VT,
                              const MachineFunction &MF) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index b0f29ee790885..b84c1093eb75f 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -25,15 +25,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    andi $a5, $a5, 255
 ; LA64-NEXT:    sll.w $a5, $a5, $a3
 ; LA64-NEXT:    and $a6, $a2, $a4
-; LA64-NEXT:    or $a6, $a6, $a5
+; LA64-NEXT:    or $a5, $a6, $a5
+; LA64-NEXT:    addi.w $a6, $a2, 0
 ; LA64-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    bne $a5, $a2, .LBB0_5
+; LA64-NEXT:    ll.w $a2, $a0, 0
+; LA64-NEXT:    bne $a2, $a6, .LBB0_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_3 Depth=2
-; LA64-NEXT:    move $a7, $a6
+; LA64-NEXT:    move $a7, $a5
 ; LA64-NEXT:    sc.w $a7, $a0, 0
 ; LA64-NEXT:    beqz $a7, .LBB0_3
 ; LA64-NEXT:    b .LBB0_6
@@ -42,11 +43,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB0_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT:    addi.w $a6, $a2, 0
-; LA64-NEXT:    move $a2, $a5
-; LA64-NEXT:    bne $a5, $a6, .LBB0_1
+; LA64-NEXT:    bne $a2, $a6, .LBB0_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a5, $a3
+; LA64-NEXT:    srl.w $a0, $a2, $a3
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -77,15 +76,16 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
 ; LA64-NEXT:    sll.w $a5, $a5, $a3
 ; LA64-NEXT:    and $a6, $a2, $a4
-; LA64-NEXT:    or $a6, $a6, $a5
+; LA64-NEXT:    or $a5, $a6, $a5
+; LA64-NEXT:    addi.w $a6, $a2, 0
 ; LA64-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a5, $a0, 0
-; LA64-NEXT:    bne $a5, $a2, .LBB1_5
+; LA64-NEXT:    ll.w $a2, $a0, 0
+; LA64-NEXT:    bne $a2, $a6, .LBB1_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_3 Depth=2
-; LA64-NEXT:    move $a7, $a6
+; LA64-NEXT:    move $a7, $a5
 ; LA64-NEXT:    sc.w $a7, $a0, 0
 ; LA64-NEXT:    beqz $a7, .LBB1_3
 ; LA64-NEXT:    b .LBB1_6
@@ -94,11 +94,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB1_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT:    addi.w $a6, $a2, 0
-; LA64-NEXT:    move $a2, $a5
-; LA64-NEXT:    bne $a5, $a6, .LBB1_1
+; LA64-NEXT:    bne $a2, $a6, .LBB1_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a5, $a3
+; LA64-NEXT:    srl.w $a0, $a2, $a3
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -107,37 +105,36 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ld.w $a3, $a0, 0
-; LA64-NEXT:    addi.w $a2, $a1, 0
+; LA64-NEXT:    ld.w $a2, $a0, 0
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB2_3 Depth 2
-; LA64-NEXT:    addi.w $a4, $a3, 0
-; LA64-NEXT:    sltu $a1, $a4, $a2
-; LA64-NEXT:    xori $a1, $a1, 1
-; LA64-NEXT:    addi.d $a5, $a3, 1
-; LA64-NEXT:    masknez $a5, $a5, $a1
+; LA64-NEXT:    addi.w $a3, $a2, 0
+; LA64-NEXT:    sltu $a4, $a3, $a1
+; LA64-NEXT:    xori $a4, $a4, 1
+; LA64-NEXT:    addi.d $a2, $a2, 1
+; LA64-NEXT:    masknez $a4, $a2, $a4
 ; LA64-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a1, $a0, 0
-; LA64-NEXT:    bne $a1, $a3, .LBB2_5
+; LA64-NEXT:    ll.w $a2, $a0, 0
+; LA64-NEXT:    bne $a2, $a3, .LBB2_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_3 Depth=2
-; LA64-NEXT:    move $a6, $a5
-; LA64-NEXT:    sc.w $a6, $a0, 0
-; LA64-NEXT:    beqz $a6, .LBB2_3
+; LA64-NEXT:    move $a5, $a4
+; LA64-NEXT:    sc.w $a5, $a0, 0
+; LA64-NEXT:    beqz $a5, .LBB2_3
 ; LA64-NEXT:    b .LBB2_6
 ; LA64-NEXT:  .LBB2_5: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_1 Depth=1
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB2_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; LA64-NEXT:    move $a3, $a1
-; LA64-NEXT:    bne $a1, $a4, .LBB2_1
+; LA64-NEXT:    bne $a2, $a3, .LBB2_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    move $a0, $a1
+; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst
   ret i32 %result
@@ -209,15 +206,16 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    andi $a6, $a6, 255
 ; LA64-NEXT:    sll.w $a6, $a6, $a3
 ; LA64-NEXT:    and $a7, $a2, $a4
-; LA64-NEXT:    or $a7, $a7, $a6
+; LA64-NEXT:    or $a6, $a7, $a6
+; LA64-NEXT:    addi.w $a7, $a2, 0
 ; LA64-NEXT:  .LBB4_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB4_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a6, $a0, 0
-; LA64-NEXT:    bne $a6, $a2, .LBB4_5
+; LA64-NEXT:    ll.w $a2, $a0, 0
+; LA64-NEXT:    bne $a2, $a7, .LBB4_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_3 Depth=2
-; LA64-NEXT:    move $t0, $a7
+; LA64-NEXT:    move $t0, $a6
 ; LA64-NEXT:    sc.w $t0, $a0, 0
 ; LA64-NEXT:    beqz $t0, .LBB4_3
 ; LA64-NEXT:    b .LBB4_6
@@ -226,11 +224,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB4_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT:    addi.w $a7, $a2, 0
-; LA64-NEXT:    move $a2, $a6
-; LA64-NEXT:    bne $a6, $a7, .LBB4_1
+; LA64-NEXT:    bne $a2, $a7, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a6, $a3
+; LA64-NEXT:    srl.w $a0, $a2, $a3
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -266,15 +262,16 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
 ; LA64-NEXT:    sll.w $a6, $a6, $a3
 ; LA64-NEXT:    and $a7, $a2, $a4
-; LA64-NEXT:    or $a7, $a7, $a6
+; LA64-NEXT:    or $a6, $a7, $a6
+; LA64-NEXT:    addi.w $a7, $a2, 0
 ; LA64-NEXT:  .LBB5_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB5_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a6, $a0, 0
-; LA64-NEXT:    bne $a6, $a2, .LBB5_5
+; LA64-NEXT:    ll.w $a2, $a0, 0
+; LA64-NEXT:    bne $a2, $a7, .LBB5_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_3 Depth=2
-; LA64-NEXT:    move $t0, $a7
+; LA64-NEXT:    move $t0, $a6
 ; LA64-NEXT:    sc.w $t0, $a0, 0
 ; LA64-NEXT:    beqz $t0, .LBB5_3
 ; LA64-NEXT:    b .LBB5_6
@@ -283,11 +280,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB5_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT:    addi.w $a7, $a2, 0
-; LA64-NEXT:    move $a2, $a6
-; LA64-NEXT:    bne $a6, $a7, .LBB5_1
+; LA64-NEXT:    bne $a2, $a7, .LBB5_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a6, $a3
+; LA64-NEXT:    srl.w $a0, $a2, $a3
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -296,22 +291,22 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ld.w $a4, $a0, 0
+; LA64-NEXT:    ld.w $a2, $a0, 0
 ; LA64-NEXT:    addi.w $a3, $a1, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB6_3 Depth 2
-; LA64-NEXT:    addi.w $a5, $a4, 0
-; LA64-NEXT:    sltu $a2, $a3, $a5
-; LA64-NEXT:    addi.d $a6, $a4, -1
-; LA64-NEXT:    masknez $a6, $a6, $a2
-; LA64-NEXT:    maskeqz $a2, $a1, $a2
-; LA64-NEXT:    or $a2, $a2, $a6
-; LA64-NEXT:    sltui $a6, $a5, 1
-; LA64-NEXT:    masknez $a2, $a2, $a6
-; LA64-NEXT:    maskeqz $a6, $a1, $a6
-; LA64-NEXT:    or $a6, $a6, $a2
+; LA64-NEXT:    addi.w $a4, $a2, 0
+; LA64-NEXT:    sltu $a5, $a3, $a4
+; LA64-NEXT:    addi.d $a2, $a2, -1
+; LA64-NEXT:    masknez $a2, $a2, $a5
+; LA64-NEXT:    maskeqz $a5, $a1, $a5
+; LA64-NEXT:    or $a2, $a5, $a2
+; LA64-NEXT:    sltui $a5, $a4, 1
+; LA64-NEXT:    masknez $a2, $a2, $a5
+; LA64-NEXT:    maskeqz $a5, $a1, $a5
+; LA64-NEXT:    or $a5, $a5, $a2
 ; LA64-NEXT:  .LBB6_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB6_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
@@ -319,17 +314,16 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    bne $a2, $a4, .LBB6_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_3 Depth=2
-; LA64-NEXT:    move $a7, $a6
-; LA64-NEXT:    sc.w $a7, $a0, 0
-; LA64-NEXT:    beqz $a7, .LBB6_3
+; LA64-NEXT:    move $a6, $a5
+; LA64-NEXT:    sc.w $a6, $a0, 0
+; LA64-NEXT:    beqz $a6, .LBB6_3
 ; LA64-NEXT:    b .LBB6_6
 ; LA64-NEXT:  .LBB6_5: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_1 Depth=1
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB6_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; LA64-NEXT:    move $a4, $a2
-; LA64-NEXT:    bne $a2, $a5, .LBB6_1
+; LA64-NEXT:    bne $a2, $a4, .LBB6_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
index 417c865f6383f..31ecec6ea8051 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
@@ -69,6 +69,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_acquire_acquire:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a3, $a0, 0
 ; LA64-NEXT:    bne $a3, $a1, .LBB2_3
@@ -172,6 +173,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_acquire_monotonic:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a3, $a0, 0
 ; LA64-NEXT:    bne $a3, $a1, .LBB6_3
@@ -279,9 +281,10 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou
 define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti32:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a3, $a1, 0
 ; LA64-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a1, .LBB10_3
+; LA64-NEXT:    ll.w $a1, $a0, 0
+; LA64-NEXT:    bne $a1, $a3, .LBB10_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
 ; LA64-NEXT:    move $a4, $a2
 ; LA64-NEXT:    sc.w $a4, $a0, 0
@@ -290,7 +293,7 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou
 ; LA64-NEXT:  .LBB10_3:
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB10_4:
-; LA64-NEXT:    move $a0, $a3
+; LA64-NEXT:    move $a0, $a1
 ; LA64-NEXT:    ret
   %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire
   %res = extractvalue { i32, i1 } %tmp, 0
@@ -396,6 +399,7 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw
 define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti1:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a3, $a0, 0
 ; LA64-NEXT:    bne $a3, $a1, .LBB14_3
@@ -407,8 +411,7 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw
 ; LA64-NEXT:  .LBB14_3:
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB14_4:
-; LA64-NEXT:    addi.w $a0, $a1, 0
-; LA64-NEXT:    xor $a0, $a3, $a0
+; LA64-NEXT:    xor $a0, $a3, $a1
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
   %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire
@@ -506,6 +509,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a3, $a0, 0
 ; LA64-NEXT:    bne $a3, $a1, .LBB18_3
@@ -613,9 +617,10 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val)
 define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a3, $a1, 0
 ; LA64-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT:    ll.w $a3, $a0, 0
-; LA64-NEXT:    bne $a3, $a1, .LBB22_3
+; LA64-NEXT:    ll.w $a1, $a0, 0
+; LA64-NEXT:    bne $a1, $a3, .LBB22_3
 ; LA64-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
 ; LA64-NEXT:    move $a4, $a2
 ; LA64-NEXT:    sc.w $a4, $a0, 0
@@ -624,7 +629,7 @@ define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val)
 ; LA64-NEXT:  .LBB22_3:
 ; LA64-NEXT:    dbar 1792
 ; LA64-NEXT:  .LBB22_4:
-; LA64-NEXT:    move $a0, $a3
+; LA64-NEXT:    move $a0, $a1
 ; LA64-NEXT:    ret
   %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic
   %res = extractvalue { i32, i1 } %tmp, 0
@@ -730,6 +735,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n
 define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind {
 ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1:
 ; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:  .LBB26_1: # =>This Inner Loop Header: Depth=1
 ; LA64-NEXT:    ll.w $a3, $a0, 0
 ; LA64-NEXT:    bne $a3, $a1, .LBB26_3
@@ -741,8 +747,7 @@ define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) n
 ; LA64-NEXT:  .LBB26_3:
 ; LA64-NEXT:    dbar 1792
 ; LA64-NEXT:  .LBB26_4:
-; LA64-NEXT:    addi.w $a0, $a1, 0
-; LA64-NEXT:    xor $a0, $a3, $a0
+; LA64-NEXT:    xor $a0, $a3, $a1
 ; LA64-NEXT:    sltui $a0, $a0, 1
 ; LA64-NEXT:    ret
   %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
index 589360823b148..4d8160d708034 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll
@@ -16,6 +16,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -33,8 +34,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB0_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB0_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB0_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -51,6 +51,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -68,8 +69,7 @@ define float @float_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB0_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB0_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB0_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, float 1.0 acquire, align 4
@@ -90,6 +90,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -107,8 +108,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB1_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB1_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB1_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -125,6 +125,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -142,8 +143,7 @@ define float @float_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB1_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB1_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB1_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, float 1.0 acquire, align 4
@@ -165,6 +165,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -182,8 +183,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB2_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB2_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB2_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB2_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -201,6 +201,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -218,8 +219,7 @@ define float @float_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB2_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB2_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB2_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB2_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmin ptr %p, float 1.0 acquire, align 4
@@ -241,6 +241,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB3_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB3_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -258,8 +259,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB3_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB3_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB3_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -277,6 +277,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB3_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB3_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -294,8 +295,7 @@ define float @float_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB3_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB3_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB3_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmax ptr %p, float 1.0 acquire, align 4
@@ -694,6 +694,7 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB8_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB8_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -711,8 +712,7 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB8_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB8_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB8_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB8_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -729,6 +729,7 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB8_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB8_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -746,8 +747,7 @@ define float @float_fadd_release(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB8_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB8_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB8_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB8_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, float 1.0 release, align 4
@@ -768,6 +768,7 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB9_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB9_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -785,8 +786,7 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB9_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB9_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB9_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB9_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -803,6 +803,7 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB9_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB9_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -820,8 +821,7 @@ define float @float_fsub_release(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB9_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB9_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB9_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB9_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, float 1.0 release, align 4
@@ -843,6 +843,7 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB10_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB10_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -860,8 +861,7 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB10_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB10_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB10_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB10_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -879,6 +879,7 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB10_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB10_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -896,8 +897,7 @@ define float @float_fmin_release(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB10_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB10_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB10_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB10_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmin ptr %p, float 1.0 release, align 4
@@ -919,6 +919,7 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB11_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB11_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -936,8 +937,7 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB11_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB11_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB11_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB11_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -955,6 +955,7 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB11_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB11_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -972,8 +973,7 @@ define float @float_fmax_release(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB11_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB11_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB11_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB11_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmax ptr %p, float 1.0 release, align 4
@@ -1372,6 +1372,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB16_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB16_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1389,8 +1390,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB16_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB16_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB16_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB16_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -1407,6 +1407,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB16_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB16_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1424,8 +1425,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB16_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB16_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB16_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB16_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, float 1.0 acq_rel, align 4
@@ -1446,6 +1446,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB17_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB17_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1463,8 +1464,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB17_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB17_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB17_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB17_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -1481,6 +1481,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB17_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB17_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1498,8 +1499,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB17_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB17_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB17_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB17_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, float 1.0 acq_rel, align 4
@@ -1521,6 +1521,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB18_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB18_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1538,8 +1539,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB18_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB18_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB18_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB18_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -1557,6 +1557,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB18_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB18_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1574,8 +1575,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB18_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB18_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB18_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB18_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmin ptr %p, float 1.0 acq_rel, align 4
@@ -1597,6 +1597,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB19_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB19_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1614,8 +1615,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB19_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB19_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB19_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB19_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -1633,6 +1633,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB19_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB19_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -1650,8 +1651,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB19_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB19_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB19_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB19_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmax ptr %p, float 1.0 acq_rel, align 4
@@ -2074,6 +2074,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB24_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB24_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2091,8 +2092,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB24_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB24_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB24_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB24_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2109,6 +2109,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB24_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB24_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2126,8 +2127,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB24_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB24_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB24_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB24_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, float 1.0 seq_cst, align 4
@@ -2148,6 +2148,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB25_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB25_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2165,8 +2166,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB25_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB25_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB25_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB25_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2183,6 +2183,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB25_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB25_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2200,8 +2201,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB25_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB25_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB25_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB25_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, float 1.0 seq_cst, align 4
@@ -2223,6 +2223,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB26_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB26_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2240,8 +2241,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB26_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB26_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB26_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB26_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2259,6 +2259,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB26_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB26_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2276,8 +2277,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB26_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB26_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB26_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB26_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmin ptr %p, float 1.0 seq_cst, align 4
@@ -2299,6 +2299,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB27_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB27_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2316,8 +2317,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB27_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB27_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB27_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB27_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2335,6 +2335,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB27_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB27_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2352,8 +2353,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB27_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB27_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB27_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB27_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmax ptr %p, float 1.0 seq_cst, align 4
@@ -2752,6 +2752,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB32_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB32_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2769,8 +2770,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB32_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB32_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB32_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB32_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2787,6 +2787,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB32_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB32_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2804,8 +2805,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB32_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB32_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB32_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB32_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fadd ptr %p, float 1.0 monotonic, align 4
@@ -2826,6 +2826,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB33_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB33_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2843,8 +2844,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB33_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB33_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB33_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB33_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2861,6 +2861,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fadd.s $fa2, $fa0, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB33_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB33_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2878,8 +2879,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB33_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB33_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB33_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB33_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fsub ptr %p, float 1.0 monotonic, align 4
@@ -2901,6 +2901,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB34_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB34_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2918,8 +2919,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB34_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB34_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB34_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB34_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -2937,6 +2937,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fmin.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB34_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB34_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2954,8 +2955,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB34_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB34_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB34_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB34_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmin ptr %p, float 1.0 monotonic, align 4
@@ -2977,6 +2977,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64F-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64F-NEXT:    movfr2gr.s $a2, $fa0
+; LA64F-NEXT:    addi.w $a2, $a2, 0
 ; LA64F-NEXT:  .LBB35_3: # %atomicrmw.start
 ; LA64F-NEXT:    # Parent Loop BB35_1 Depth=1
 ; LA64F-NEXT:    # => This Inner Loop Header: Depth=2
@@ -2994,8 +2995,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64F-NEXT:  .LBB35_6: # %atomicrmw.start
 ; LA64F-NEXT:    # in Loop: Header=BB35_1 Depth=1
 ; LA64F-NEXT:    movgr2fr.w $fa0, $a3
-; LA64F-NEXT:    addi.w $a1, $a2, 0
-; LA64F-NEXT:    bne $a3, $a1, .LBB35_1
+; LA64F-NEXT:    bne $a3, $a2, .LBB35_1
 ; LA64F-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64F-NEXT:    ret
 ;
@@ -3013,6 +3013,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:    fmax.s $fa2, $fa2, $fa1
 ; LA64D-NEXT:    movfr2gr.s $a1, $fa2
 ; LA64D-NEXT:    movfr2gr.s $a2, $fa0
+; LA64D-NEXT:    addi.w $a2, $a2, 0
 ; LA64D-NEXT:  .LBB35_3: # %atomicrmw.start
 ; LA64D-NEXT:    # Parent Loop BB35_1 Depth=1
 ; LA64D-NEXT:    # => This Inner Loop Header: Depth=2
@@ -3030,8 +3031,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind {
 ; LA64D-NEXT:  .LBB35_6: # %atomicrmw.start
 ; LA64D-NEXT:    # in Loop: Header=BB35_1 Depth=1
 ; LA64D-NEXT:    movgr2fr.w $fa0, $a3
-; LA64D-NEXT:    addi.w $a1, $a2, 0
-; LA64D-NEXT:    bne $a3, $a1, .LBB35_1
+; LA64D-NEXT:    bne $a3, $a2, .LBB35_1
 ; LA64D-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64D-NEXT:    ret
   %v = atomicrmw fmax ptr %p, float 1.0 monotonic, align 4

From d7a9810f9c14e6598265ab41519be9b861228450 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Wed, 31 Jan 2024 02:32:15 +0000
Subject: [PATCH 16/46] [AArch64] Fix variadic tail-calls on ARM64EC (#79774)

ARM64EC varargs calls expect that x4 = sp at entry. Special handling is
needed to ensure this for tail calls, since they occur after the
epilogue while the x4 write happens before it.

I tried going through AArch64MachineFrameLowering for this, hoping to
avoid creating the dummy object, but this was the best I could do since
the stack info that it uses isn't populated at this stage.
CreateFixedObject also explicitly forbids 0-sized objects.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 10 ++++-
 llvm/test/CodeGen/AArch64/arm64ec-varargs.ll  | 37 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/vararg-tallcall.ll  |  8 ++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0287856560e91..196aa50cf4060 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8007,11 +8007,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (IsVarArg && Subtarget->isWindowsArm64EC()) {
+    SDValue ParamPtr = StackPtr;
+    if (IsTailCall) {
+      // Create a dummy object at the top of the stack that can be used to get
+      // the SP after the epilogue
+      int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
+      ParamPtr = DAG.getFrameIndex(FI, PtrVT);
+    }
+
     // For vararg calls, the Arm64EC ABI requires values in x4 and x5
     // describing the argument list.  x4 contains the address of the
     // first stack parameter. x5 contains the size in bytes of all parameters
     // passed on the stack.
-    RegsToPass.emplace_back(AArch64::X4, StackPtr);
+    RegsToPass.emplace_back(AArch64::X4, ParamPtr);
     RegsToPass.emplace_back(AArch64::X5,
                             DAG.getConstant(NumBytes, DL, MVT::i64));
   }
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
index dc16b3a1a0f27..844fc52ddade6 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -100,5 +100,42 @@ define void @varargs_many_argscalleer() nounwind {
   ret void
 }
 
+define void @varargs_caller_tail() nounwind {
+; CHECK-LABEL: varargs_caller_tail:
+; CHECK:        // %bb.0:
+; CHECK-NEXT:        sub     sp, sp, #48
+; CHECK-NEXT:        mov     x4, sp
+; CHECK-NEXT:        add     x8, sp, #16
+; CHECK-NEXT:        mov     x9, #4617315517961601024        // =0x4014000000000000
+; CHECK-NEXT:        mov     x0, #4607182418800017408        // =0x3ff0000000000000
+; CHECK-NEXT:        mov     w1, #2                          // =0x2
+; CHECK-NEXT:        mov     x2, #4613937818241073152        // =0x4008000000000000
+; CHECK-NEXT:        mov     w3, #4                          // =0x4
+; CHECK-NEXT:        mov     w5, #16                         // =0x10
+; CHECK-NEXT:        stp     xzr, x30, [sp, #24]             // 8-byte Folded Spill
+; CHECK-NEXT:        stp     x9, x8, [sp]
+; CHECK-NEXT:        str     xzr, [sp, #16]
+; CHECK-NEXT:        .weak_anti_dep  varargs_callee
+; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF
+; CHECK-NEXT:        .weak_anti_dep  "#varargs_callee"
+; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF
+; CHECK-NEXT:        bl      "#varargs_callee"
+; CHECK-NEXT:        ldr     x30, [sp, #32]                  // 8-byte Folded Reload
+; CHECK-NEXT:        add     x4, sp, #48
+; CHECK-NEXT:        mov     x0, #4607182418800017408        // =0x3ff0000000000000
+; CHECK-NEXT:        mov     w1, #4                          // =0x4
+; CHECK-NEXT:        mov     w2, #3                          // =0x3
+; CHECK-NEXT:        mov     w3, #2                          // =0x2
+; CHECK-NEXT:        mov     x5, xzr
+; CHECK-NEXT:        add     sp, sp, #48
+; CHECK-NEXT:        .weak_anti_dep  varargs_callee
+; CHECK-NEXT:.set varargs_callee, "#varargs_callee"@WEAKREF
+; CHECK-NEXT:        .weak_anti_dep  "#varargs_callee"
+; CHECK-NEXT:.set "#varargs_callee", varargs_callee@WEAKREF
+; CHECK-NEXT:        b       "#varargs_callee"
+  call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> <double 0.0, double 0.0>)
+  tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2)
+  ret void
+}
 
 declare void @llvm.va_start(ptr)
diff --git a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll
index 2d6db1642247d..812837639196e 100644
--- a/llvm/test/CodeGen/AArch64/vararg-tallcall.ll
+++ b/llvm/test/CodeGen/AArch64/vararg-tallcall.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64ec-windows-msvc %s -o - | FileCheck %s --check-prefixes=CHECK-EC
 ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s
 ; RUN: llc -global-isel -global-isel-abort=2 -verify-machineinstrs -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
 
@@ -32,3 +33,10 @@ attributes #1 = { noinline optnone "thunk" }
 ; CHECK: ldr     x9, [x9]
 ; CHECK: mov     v0.16b, v16.16b
 ; CHECK: br      x9
+; CHECK-EC: mov     v7.16b, v0.16b
+; CHECK-EC: ldr     x9, [x0]
+; CHECK-EC: ldr     x11, [x9]
+; CHECK-EC: mov     v0.16b, v7.16b
+; CHECK-EC: add     x4, sp, #64
+; CHECK-EC: add     sp, sp, #64
+; CHECK-EC: br      x11

From 42c599ab365b6d413d7e35a40f6e4ab98f523b67 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Mon, 5 Feb 2024 17:26:16 +0000
Subject: [PATCH 17/46] [AArch64] Fix generated types for ARM64EC variadic
 entry thunk targets (#80595)

ISel handles filling in x4/x5 when calling variadic functions as they
don't correspond to the 5th/6th X64 arguments but rather to the end of
the shadow space on the stack and the size in bytes of all stack
parameters (ignored and written as 0 for calls from entry thunks).

Will PR a follow up with ISel handling after this is merged.
---
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 48 +++++++++++--------
 .../CodeGen/AArch64/arm64ec-entry-thunks.ll   |  4 +-
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 11248bb7aef31..91b4f18c73c93 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -43,6 +43,8 @@ static cl::opt<bool> GenerateThunks("arm64ec-generate-thunks", cl::Hidden,
 
 namespace {
 
+enum class ThunkType { GuestExit, Entry, Exit };
+
 class AArch64Arm64ECCallLowering : public ModulePass {
 public:
   static char ID;
@@ -69,14 +71,14 @@ class AArch64Arm64ECCallLowering : public ModulePass {
   Type *I64Ty;
   Type *VoidTy;
 
-  void getThunkType(FunctionType *FT, AttributeList AttrList, bool EntryThunk,
+  void getThunkType(FunctionType *FT, AttributeList AttrList, ThunkType TT,
                     raw_ostream &Out, FunctionType *&Arm64Ty,
                     FunctionType *&X64Ty);
   void getThunkRetType(FunctionType *FT, AttributeList AttrList,
                        raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy,
                        SmallVectorImpl<Type *> &Arm64ArgTypes,
                        SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr);
-  void getThunkArgTypes(FunctionType *FT, AttributeList AttrList,
+  void getThunkArgTypes(FunctionType *FT, AttributeList AttrList, ThunkType TT,
                         raw_ostream &Out,
                         SmallVectorImpl<Type *> &Arm64ArgTypes,
                         SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr);
@@ -89,10 +91,11 @@ class AArch64Arm64ECCallLowering : public ModulePass {
 
 void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
                                               AttributeList AttrList,
-                                              bool EntryThunk, raw_ostream &Out,
+                                              ThunkType TT, raw_ostream &Out,
                                               FunctionType *&Arm64Ty,
                                               FunctionType *&X64Ty) {
-  Out << (EntryThunk ? "$ientry_thunk$cdecl$" : "$iexit_thunk$cdecl$");
+  Out << (TT == ThunkType::Entry ? "$ientry_thunk$cdecl$"
+                                 : "$iexit_thunk$cdecl$");
 
   Type *Arm64RetTy;
   Type *X64RetTy;
@@ -102,8 +105,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
 
   // The first argument to a thunk is the called function, stored in x9.
   // For exit thunks, we pass the called function down to the emulator;
-  // for entry thunks, we just call the Arm64 function directly.
-  if (!EntryThunk)
+  // for entry/guest exit thunks, we just call the Arm64 function directly.
+  if (TT == ThunkType::Exit)
     Arm64ArgTypes.push_back(PtrTy);
   X64ArgTypes.push_back(PtrTy);
 
@@ -111,14 +114,16 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
   getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes,
                   X64ArgTypes, HasSretPtr);
 
-  getThunkArgTypes(FT, AttrList, Out, Arm64ArgTypes, X64ArgTypes, HasSretPtr);
+  getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes,
+                   HasSretPtr);
 
-  Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false);
+  Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes,
+                              TT == ThunkType::Entry && FT->isVarArg());
   X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false);
 }
 
 void AArch64Arm64ECCallLowering::getThunkArgTypes(
-    FunctionType *FT, AttributeList AttrList, raw_ostream &Out,
+    FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out,
     SmallVectorImpl<Type *> &Arm64ArgTypes,
     SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr) {
 
@@ -151,14 +156,16 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
       X64ArgTypes.push_back(I64Ty);
     }
 
-    // x4
-    Arm64ArgTypes.push_back(PtrTy);
-    X64ArgTypes.push_back(PtrTy);
-    // x5
-    Arm64ArgTypes.push_back(I64Ty);
-    // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we
-    // have proper isel for varargs
-    X64ArgTypes.push_back(I64Ty);
+    if (TT != ThunkType::Entry) {
+      // x4
+      Arm64ArgTypes.push_back(PtrTy);
+      X64ArgTypes.push_back(PtrTy);
+      // x5
+      Arm64ArgTypes.push_back(I64Ty);
+      // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we
+      // have proper isel for varargs
+      X64ArgTypes.push_back(I64Ty);
+    }
     return;
   }
 
@@ -339,8 +346,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
   SmallString<256> ExitThunkName;
   llvm::raw_svector_ostream ExitThunkStream(ExitThunkName);
   FunctionType *Arm64Ty, *X64Ty;
-  getThunkType(FT, Attrs, /*EntryThunk*/ false, ExitThunkStream, Arm64Ty,
-               X64Ty);
+  getThunkType(FT, Attrs, ThunkType::Exit, ExitThunkStream, Arm64Ty, X64Ty);
   if (Function *F = M->getFunction(ExitThunkName))
     return F;
 
@@ -443,7 +449,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
   SmallString<256> EntryThunkName;
   llvm::raw_svector_ostream EntryThunkStream(EntryThunkName);
   FunctionType *Arm64Ty, *X64Ty;
-  getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true,
+  getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::Entry,
                EntryThunkStream, Arm64Ty, X64Ty);
   if (Function *F = M->getFunction(EntryThunkName))
     return F;
@@ -518,7 +524,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
 Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
   llvm::raw_null_ostream NullThunkName;
   FunctionType *Arm64Ty, *X64Ty;
-  getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true,
+  getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::GuestExit,
                NullThunkName, Arm64Ty, X64Ty);
   auto MangledName = getArm64ECMangledFunctionName(F->getName().str());
   assert(MangledName && "Can't guest exit to function that's already native");
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 5c56f51e1ca55..0083818def151 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -147,8 +147,8 @@ define void @has_varargs(...) nounwind {
 ; CHECK-NEXT:     add     x29, sp, #160
 ; CHECK-NEXT:     .seh_add_fp     160
 ; CHECK-NEXT:     .seh_endprologue
-; CHECK-NEXT:     ldp     x8, x5, [x4, #32]
-; CHECK-NEXT:     mov     x4, x8
+; CHECK-NEXT:     mov     x4, sp
+; CHECK-NEXT:     mov     x5, xzr
 ; CHECK-NEXT:     blr     x9
 ; CHECK-NEXT:     adrp    x8, __os_arm64x_dispatch_ret
 ; CHECK-NEXT:     ldr     x0, [x8, :lo12:__os_arm64x_dispatch_ret]

From 89d543227a324d9925c68caf5d75c8c1e46c02a4 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Tue, 27 Feb 2024 18:32:15 +0000
Subject: [PATCH 18/46] [AArch64] Skip over shadow space for ARM64EC entry
 thunk variadic calls (#80994)

When in an entry thunk, the x64 SP is passed in x4, but this cannot be
passed through directly since x64 varargs calls have a 32-byte shadow
store at SP followed by the in-stack parameters. ARM64EC varargs calls,
on the other hand, expect x4 to point to the first in-stack parameter.
---
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 35 ++++++++++++++-----
 .../AArch64/AArch64CallingConvention.td       |  3 ++
 .../CodeGen/AArch64/arm64ec-entry-thunks.ll   |  2 +-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 91b4f18c73c93..03d641d04413e 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -117,8 +117,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
   getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes,
                    HasSretPtr);
 
-  Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes,
-                              TT == ThunkType::Entry && FT->isVarArg());
+  Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false);
+
   X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false);
 }
 
@@ -156,13 +156,13 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
       X64ArgTypes.push_back(I64Ty);
     }
 
+    // x4
+    Arm64ArgTypes.push_back(PtrTy);
+    X64ArgTypes.push_back(PtrTy);
+    // x5
+    Arm64ArgTypes.push_back(I64Ty);
     if (TT != ThunkType::Entry) {
-      // x4
-      Arm64ArgTypes.push_back(PtrTy);
-      X64ArgTypes.push_back(PtrTy);
-      // x5
-      Arm64ArgTypes.push_back(I64Ty);
-      // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we
+      // FIXME: x5 isn't actually used by the x64 side; revisit once we
       // have proper isel for varargs
       X64ArgTypes.push_back(I64Ty);
     }
@@ -471,10 +471,11 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
 
   bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy();
   unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1;
+  unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size();
 
   // Translate arguments to call.
   SmallVector<Value *> Args;
-  for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) {
+  for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) {
     Value *Arg = Thunk->getArg(i);
     Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
     if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
@@ -491,6 +492,22 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
     Args.push_back(Arg);
   }
 
+  if (F->isVarArg()) {
+    // The 5th argument to variadic entry thunks is used to model the x64 sp
+    // which is passed to the thunk in x4, this can be passed to the callee as
+    // the variadic argument start address after skipping over the 32 byte
+    // shadow store.
+
+    // The EC thunk CC will assign any argument marked as InReg to x4.
+    Thunk->addParamAttr(5, Attribute::InReg);
+    Value *Arg = Thunk->getArg(5);
+    Arg = IRB.CreatePtrAdd(Arg, IRB.getInt64(0x20));
+    Args.push_back(Arg);
+
+    // Pass in a zero variadic argument size (in x5).
+    Args.push_back(IRB.getInt64(0));
+  }
+
   // Call the function passed to the thunk.
   Value *Callee = Thunk->getArg(0);
   Callee = IRB.CreateBitCast(Callee, PtrTy);
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 78ea4a5180f70..8e67f0f5c8815 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -213,6 +213,9 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[
 // address is passed in X9.
 let Entry = 1 in
 def CC_AArch64_Arm64EC_Thunk : CallingConv<[
+  // ARM64EC-specific: the InReg attribute can be used to access the x64 sp passed into entry thunks in x4 from the IR.
+  CCIfInReg<CCIfType<[i64], CCAssignToReg<[X4]>>>,
+
   // Byval aggregates are passed by pointer
   CCIfByVal<CCPassIndirect<i64>>,
 
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 0083818def151..bb9ba05f7a272 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -147,7 +147,7 @@ define void @has_varargs(...) nounwind {
 ; CHECK-NEXT:     add     x29, sp, #160
 ; CHECK-NEXT:     .seh_add_fp     160
 ; CHECK-NEXT:     .seh_endprologue
-; CHECK-NEXT:     mov     x4, sp
+; CHECK-NEXT:     add     x4, x4, #32
 ; CHECK-NEXT:     mov     x5, xzr
 ; CHECK-NEXT:     blr     x9
 ; CHECK-NEXT:     adrp    x8, __os_arm64x_dispatch_ret

From c14879562f46c9a5b24af8e21e8df0eef3079d4a Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Wed, 21 Feb 2024 12:59:56 -0800
Subject: [PATCH 19/46] Unbreak *tf builtins for hexfloat (#82208)

This re-lands cc0065a7d082f0bd322a538cf62cfaef1c8f89f8 in a way that
keeps existing targets working.

---------

Original commit message:
#68132 ended up removing
__multc3 & __divtc3 from compiler-rt library builds that have
QUAD_PRECISION but not TF_MODE due to missing int128 support.
I added support for QUAD_PRECISION to use the native hex float long double representation.

---------

Co-authored-by: Sean Perry <perry@ca.ibm.com>
(cherry picked from commit 99c457dc2ef395872d7448c85609f6cb73a7f89b)
---
 compiler-rt/lib/builtins/divtc3.c    |  2 +-
 compiler-rt/lib/builtins/fp_lib.h    | 41 ++++++++++++++++++----------
 compiler-rt/lib/builtins/int_types.h |  8 ++++--
 compiler-rt/lib/builtins/multc3.c    |  2 +-
 4 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/compiler-rt/lib/builtins/divtc3.c b/compiler-rt/lib/builtins/divtc3.c
index e970cef574b21..099de5802daf0 100644
--- a/compiler-rt/lib/builtins/divtc3.c
+++ b/compiler-rt/lib/builtins/divtc3.c
@@ -13,7 +13,7 @@
 #define QUAD_PRECISION
 #include "fp_lib.h"
 
-#if defined(CRT_HAS_TF_MODE)
+#if defined(CRT_HAS_F128)
 
 // Returns: the quotient of (a + ib) / (c + id)
 
diff --git a/compiler-rt/lib/builtins/fp_lib.h b/compiler-rt/lib/builtins/fp_lib.h
index af406e760497a..c4f0a5b9587f7 100644
--- a/compiler-rt/lib/builtins/fp_lib.h
+++ b/compiler-rt/lib/builtins/fp_lib.h
@@ -22,6 +22,7 @@
 
 #include "int_lib.h"
 #include "int_math.h"
+#include "int_types.h"
 #include <limits.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -93,13 +94,14 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
 COMPILER_RT_ABI fp_t __adddf3(fp_t a, fp_t b);
 
 #elif defined QUAD_PRECISION
-#if defined(CRT_HAS_TF_MODE)
+#if defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT)
 typedef uint64_t half_rep_t;
 typedef __uint128_t rep_t;
 typedef __int128_t srep_t;
 typedef tf_float fp_t;
 #define HALF_REP_C UINT64_C
 #define REP_C (__uint128_t)
+#if defined(CRT_HAS_IEEE_TF)
 // Note: Since there is no explicit way to tell compiler the constant is a
 // 128-bit integer, we let the constant be casted to 128-bit integer
 #define significandBits 112
@@ -188,7 +190,10 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
 #undef Word_HiMask
 #undef Word_LoMask
 #undef Word_FullMask
-#endif // defined(CRT_HAS_TF_MODE)
+#endif // defined(CRT_HAS_IEEE_TF)
+#else
+typedef long double fp_t;
+#endif // defined(CRT_HAS_F128) && defined(CRT_HAS_128BIT)
 #else
 #error SINGLE_PRECISION, DOUBLE_PRECISION or QUAD_PRECISION must be defined.
 #endif
@@ -196,19 +201,6 @@ static __inline void wideMultiply(rep_t a, rep_t b, rep_t *hi, rep_t *lo) {
 #if defined(SINGLE_PRECISION) || defined(DOUBLE_PRECISION) ||                  \
     (defined(QUAD_PRECISION) && defined(CRT_HAS_TF_MODE))
 #define typeWidth (sizeof(rep_t) * CHAR_BIT)
-#define exponentBits (typeWidth - significandBits - 1)
-#define maxExponent ((1 << exponentBits) - 1)
-#define exponentBias (maxExponent >> 1)
-
-#define implicitBit (REP_C(1) << significandBits)
-#define significandMask (implicitBit - 1U)
-#define signBit (REP_C(1) << (significandBits + exponentBits))
-#define absMask (signBit - 1U)
-#define exponentMask (absMask ^ significandMask)
-#define oneRep ((rep_t)exponentBias << significandBits)
-#define infRep exponentMask
-#define quietBit (implicitBit >> 1)
-#define qnanRep (exponentMask | quietBit)
 
 static __inline rep_t toRep(fp_t x) {
   const union {
@@ -226,6 +218,21 @@ static __inline fp_t fromRep(rep_t x) {
   return rep.f;
 }
 
+#if !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF)
+#define exponentBits (typeWidth - significandBits - 1)
+#define maxExponent ((1 << exponentBits) - 1)
+#define exponentBias (maxExponent >> 1)
+
+#define implicitBit (REP_C(1) << significandBits)
+#define significandMask (implicitBit - 1U)
+#define signBit (REP_C(1) << (significandBits + exponentBits))
+#define absMask (signBit - 1U)
+#define exponentMask (absMask ^ significandMask)
+#define oneRep ((rep_t)exponentBias << significandBits)
+#define infRep exponentMask
+#define quietBit (implicitBit >> 1)
+#define qnanRep (exponentMask | quietBit)
+
 static __inline int normalize(rep_t *significand) {
   const int shift = rep_clz(*significand) - rep_clz(implicitBit);
   *significand <<= shift;
@@ -328,6 +335,8 @@ static __inline fp_t __compiler_rt_scalbnX(fp_t x, int y) {
     return fromRep(sign | ((rep_t)exp << significandBits) | sig);
 }
 
+#endif // !defined(QUAD_PRECISION) || defined(CRT_HAS_IEEE_TF)
+
 // Avoid using fmax from libm.
 static __inline fp_t __compiler_rt_fmaxX(fp_t x, fp_t y) {
   // If either argument is NaN, return the other argument. If both are NaN,
@@ -405,6 +414,8 @@ static __inline tf_float __compiler_rt_fmaxtf(tf_float x, tf_float y) {
 #define __compiler_rt_logbl crt_logbl
 #define __compiler_rt_scalbnl crt_scalbnl
 #define __compiler_rt_fmaxl crt_fmaxl
+#define crt_fabstf crt_fabsl
+#define crt_copysigntf crt_copysignl
 #else
 #error Unsupported TF mode type
 #endif
diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h
index 7624c72806151..ca97391fc2846 100644
--- a/compiler-rt/lib/builtins/int_types.h
+++ b/compiler-rt/lib/builtins/int_types.h
@@ -189,12 +189,16 @@ typedef long double tf_float;
 #define CRT_LDBL_IEEE_F128
 #endif
 #define TF_C(x) x##L
-#elif __LDBL_MANT_DIG__ == 113
-// Use long double instead of __float128 if it matches the IEEE 128-bit format.
+#elif __LDBL_MANT_DIG__ == 113 ||                                              \
+    (__FLT_RADIX__ == 16 && __LDBL_MANT_DIG__ == 28)
+// Use long double instead of __float128 if it matches the IEEE 128-bit format
+// or the IBM hexadecimal format.
 #define CRT_LDBL_128BIT
 #define CRT_HAS_F128
+#if __LDBL_MANT_DIG__ == 113
 #define CRT_HAS_IEEE_TF
 #define CRT_LDBL_IEEE_F128
+#endif
 typedef long double tf_float;
 #define TF_C(x) x##L
 #elif defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)
diff --git a/compiler-rt/lib/builtins/multc3.c b/compiler-rt/lib/builtins/multc3.c
index f20e53ccbf233..61a3f45e47279 100644
--- a/compiler-rt/lib/builtins/multc3.c
+++ b/compiler-rt/lib/builtins/multc3.c
@@ -15,7 +15,7 @@
 #include "int_lib.h"
 #include "int_math.h"
 
-#if defined(CRT_HAS_TF_MODE)
+#if defined(CRT_HAS_F128)
 
 // Returns: the product of a + ib and c + id
 

From c3721c1dcff5c2fa5e3b6916e369f5a499383c5a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 1 Mar 2024 11:17:22 -0800
Subject: [PATCH 20/46] [ELF] Internalize enum

g++ -flto has a diagnostic `-Wodr` about mismatched redeclarations,
which even applies to `enum`.

Fix #83529

Reviewers: thesamesam

Reviewed By: thesamesam

Pull Request: https://github.com/llvm/llvm-project/pull/83604

(cherry picked from commit 4a3f7e798a31072a80a0731b8fb1da21b9c626ed)
---
 lld/ELF/Arch/LoongArch.cpp | 2 ++
 lld/ELF/Arch/PPC64.cpp     | 3 ++-
 lld/ELF/Arch/RISCV.cpp     | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 05fd38fb753fd..1dab98115d9de 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -41,6 +41,7 @@ class LoongArch final : public TargetInfo {
 };
 } // end anonymous namespace
 
+namespace {
 enum Op {
   SUB_W = 0x00110000,
   SUB_D = 0x00118000,
@@ -65,6 +66,7 @@ enum Reg {
   R_T2 = 14,
   R_T3 = 15,
 };
+} // namespace
 
 // Mask out the input's lowest 12 bits for use with `pcalau12i`, in sequences
 // like `pcalau12i + addi.[wd]` or `pcalau12i + {ld,st}.*` where the `pcalau12i`
diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
index de52f6a79a40b..019c073bd541b 100644
--- a/lld/ELF/Arch/PPC64.cpp
+++ b/lld/ELF/Arch/PPC64.cpp
@@ -26,6 +26,7 @@ using namespace lld::elf;
 constexpr uint64_t ppc64TocOffset = 0x8000;
 constexpr uint64_t dynamicThreadPointerOffset = 0x8000;
 
+namespace {
 // The instruction encoding of bits 21-30 from the ISA for the Xform and Dform
 // instructions that can be used as part of the initial exec TLS sequence.
 enum XFormOpcd {
@@ -139,6 +140,7 @@ enum class PPCPrefixedInsn : uint64_t {
   PSTXV = PREFIX_8LS | 0xd8000000,
   PSTXVP = PREFIX_8LS | 0xf8000000
 };
+
 static bool checkPPCLegacyInsn(uint32_t encoding) {
   PPCLegacyInsn insn = static_cast<PPCLegacyInsn>(encoding);
   if (insn == PPCLegacyInsn::NOINSN)
@@ -164,7 +166,6 @@ enum class LegacyToPrefixMask : uint64_t {
       0x8000000003e00000, // S/T (6-10) - The [S/T]X bit moves from 28 to 5.
 };
 
-namespace {
 class PPC64 final : public TargetInfo {
 public:
   PPC64();
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 5fcab4d39d43a..4798c86f7d1b6 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -57,6 +57,7 @@ class RISCV final : public TargetInfo {
 
 const uint64_t dtpOffset = 0x800;
 
+namespace {
 enum Op {
   ADDI = 0x13,
   AUIPC = 0x17,
@@ -78,6 +79,7 @@ enum Reg {
   X_A0 = 10,
   X_T3 = 28,
 };
+} // namespace
 
 static uint32_t hi20(uint32_t val) { return (val + 0x800) >> 12; }
 static uint32_t lo12(uint32_t val) { return val & 4095; }

From eb9bc02b06cb07ffdd4c5ee55d6b6b591d78656f Mon Sep 17 00:00:00 2001
From: Shih-Po Hung <shihpo.hung@sifive.com>
Date: Sat, 2 Mar 2024 12:33:55 +0800
Subject: [PATCH 21/46] [RISCV] Fix crash when unrolling loop containing vector
 instructions (#83384)

When MVT is not a vector type, TCK_CodeSize should return an invalid
cost. This patch adds a check at the beginning to make sure all cost
kinds return invalid costs consistently.

Before this patch, TCK_CodeSize returns a valid cost on scalar MVT but
other cost kinds don't.

This fixes issue #83294, where a loop contains vector instructions
and MVT is scalar after type legalization when the vector extension is
not enabled.

(cherry picked from commit fb67dce1cb87e279593c27bd4122fe63bad75f04)
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  3 ++
 .../CostModel/RISCV/vector-cost-without-v.ll  | 53 +++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 866d5cf340e68..66dab70d455ff 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -37,6 +37,9 @@ static cl::opt<unsigned> SLPMaxVF(
 InstructionCost
 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                       TTI::TargetCostKind CostKind) {
+  // Check if the type is valid for all CostKind
+  if (!VT.isVector())
+    return InstructionCost::getInvalid();
   size_t NumInstr = OpCodes.size();
   if (CostKind == TTI::TCK_CodeSize)
     return NumInstr;
diff --git a/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll
new file mode 100644
index 0000000000000..cd99065f0285c
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/vector-cost-without-v.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -mtriple=riscv64 -mattr=+f,+d --passes=loop-unroll-full -S | FileCheck %s
+
+; Check it doesn't crash when the vector extension is not enabled.
+define void @foo() {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[SPLAT_SPLAT_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[CMP1_I_I_I:%.*]] = fcmp ogt <2 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[SPLAT_SPLAT3_I_I_I:%.*]] = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[XOR3_I_I_I_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[SPLAT_SPLAT8_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[SUB_I_I_I:%.*]] = fsub <2 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[MUL_I_I_I:%.*]] = shl i64 0, 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr null, align 4
+; CHECK-NEXT:    [[SPLAT_SPLAT_I_I_I_I:%.*]] = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[XOR3_I_I_I_V_I_I:%.*]] = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV1]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV1]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv1 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %0 = load float, ptr null, align 4
+  %splat.splat.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+  %cmp1.i.i.i = fcmp ogt <2 x float> zeroinitializer, zeroinitializer
+  %splat.splat3.i.i.i = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+  %xor3.i.i.i.i.i = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+  %1 = load float, ptr null, align 4
+  %splat.splat8.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+  %sub.i.i.i = fsub <2 x float> zeroinitializer, zeroinitializer
+  %mul.i.i.i = shl i64 0, 0
+  %2 = load float, ptr null, align 4
+  %splat.splat.i.i.i.i = shufflevector <2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x i32> zeroinitializer
+  %xor3.i.i.i.v.i.i = select <2 x i1> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer
+  %indvars.iv.next = add i64 %indvars.iv1, 1
+  %exitcond = icmp ne i64 %indvars.iv1, 8
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:                                             ; preds = %for.body
+  ret void
+}

From d8352e93c1c8042d9166eab3d76d6c07ef585b6d Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Tue, 27 Feb 2024 20:19:44 +0100
Subject: [PATCH 22/46] [Clang] [Sema] Handle placeholders in '.*' expressions
 (#83103)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When analysing whether we should handle a binary expression as an
overloaded operator call or a builtin operator, we were calling
`checkPlaceholderForOverload()`, which takes care of any placeholders
that are not overload sets—which would usually make sense since those
need to be handled as part of overload resolution.

Unfortunately, we were also doing that for `.*`, which is not
overloadable, and then proceeding to create a builtin operator anyway,
which would crash if the RHS happened to be an unresolved overload set
(due to hitting an assertion in `CreateBuiltinBinOp()`—specifically, in one
of its callees—in the `.*` case that makes sure its arguments aren’t
placeholders).

This PR instead makes it so we check for *all* placeholders early if the
operator is `.*`.

It’s worth noting that,
1. In the `.*` case, we now additionally also check for *any*
placeholders (not just non-overload-sets) in the LHS; this shouldn’t
make a difference, however—at least I couldn’t think of a way to trigger
the assertion with an overload set as the LHS of `.*`; it is worth
noting that the assertion in question would also complain if the LHS
happened to be of placeholder type, though.
2. There is another case in which we also don’t perform overload
resolution—namely `=` if the LHS is not of class or enumeration type
after handling non-overload-set placeholders—as in the `.*` case, but
similarly to 1., I first couldn’t think of a way of getting this case to
crash, and secondly, `CreateBuiltinBinOp()` doesn’t seem to care about
placeholders in the LHS or RHS in the `=` case (from what I can tell,
it, or rather one of its callees, only checks that the LHS is not a
pseudo-object type, but those will have already been handled by the call
to `checkPlaceholderForOverload()` by the time we get to this function),
so I don’t think this case suffers from the same problem.

This fixes #53815.

---------

Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/docs/ReleaseNotes.rst     |  2 ++
 clang/lib/Sema/SemaOverload.cpp | 22 +++++++++++++++++-----
 clang/test/SemaCXX/gh53815.cpp  | 21 +++++++++++++++++++++
 3 files changed, 40 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaCXX/gh53815.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index fc27297aea2d6..101b3a54b9af2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1103,6 +1103,8 @@ Bug Fixes to C++ Support
   (`#82258 <https://github.com/llvm/llvm-project/issues/82258>`_)
 - Correctly immediate-escalate lambda conversion functions.
   (`#82258 <https://github.com/llvm/llvm-project/issues/82258>`_)
+- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator.
+  (`#53815 <https://github.com/llvm/llvm-project/issues/53815>`_)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 940bcccb9e261..b708272ebe7d8 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14470,6 +14470,23 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
                                        CurFPFeatureOverrides());
   }
 
+  // If this is the .* operator, which is not overloadable, just
+  // create a built-in binary operator.
+  if (Opc == BO_PtrMemD) {
+    auto CheckPlaceholder = [&](Expr *&Arg) {
+      ExprResult Res = CheckPlaceholderExpr(Arg);
+      if (Res.isUsable())
+        Arg = Res.get();
+      return !Res.isUsable();
+    };
+
+    // CreateBuiltinBinOp() doesn't like it if we tell it to create a '.*'
+    // expression that contains placeholders (in either the LHS or RHS).
+    if (CheckPlaceholder(Args[0]) || CheckPlaceholder(Args[1]))
+      return ExprError();
+    return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
+  }
+
   // Always do placeholder-like conversions on the RHS.
   if (checkPlaceholderForOverload(*this, Args[1]))
     return ExprError();
@@ -14489,11 +14506,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
   if (Opc == BO_Assign && !Args[0]->getType()->isOverloadableType())
     return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
 
-  // If this is the .* operator, which is not overloadable, just
-  // create a built-in binary operator.
-  if (Opc == BO_PtrMemD)
-    return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]);
-
   // Build the overload set.
   OverloadCandidateSet CandidateSet(OpLoc, OverloadCandidateSet::CSK_Operator,
                                     OverloadCandidateSet::OperatorRewriteInfo(
diff --git a/clang/test/SemaCXX/gh53815.cpp b/clang/test/SemaCXX/gh53815.cpp
new file mode 100644
index 0000000000000..326c911c7bfaf
--- /dev/null
+++ b/clang/test/SemaCXX/gh53815.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
+// expected-no-diagnostics
+
+// Check that we don't crash due to forgetting to check for placeholders
+// in the RHS of '.*'.
+
+template <typename Fn>
+static bool has_explicitly_named_overload() {
+  return requires { Fn().*&Fn::operator(); };
+}
+
+int main() {
+  has_explicitly_named_overload<decltype([](auto){})>();
+}
+
+template <typename Fn>
+constexpr bool has_explicitly_named_overload_2() {
+  return requires { Fn().*&Fn::operator(); };
+}
+
+static_assert(!has_explicitly_named_overload_2<decltype([](auto){})>());

From 78859f118a6b4c7b06c543c7794c59befdc78924 Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Tue, 5 Mar 2024 15:50:14 +0800
Subject: [PATCH 23/46] [lld][LoongArch] Support the R_LARCH_{ADD,SUB}_ULEB128
 relocation types (#81133)

For a label difference like `.uleb128 A-B`, MC generates a pair of
R_LARCH_{ADD,SUB}_ULEB128 if A-B cannot be folded as a constant. GNU
assembler generates a pair of relocations in more cases (when A or B is
in a code section with linker relaxation). It is similar to RISCV.

R_LARCH_{ADD,SUB}_ULEB128 relocations are created by Clang and GCC in
`.gcc_except_table` and other debug sections with linker relaxation
enabled. On LoongArch, the linker first reads the buffer and counts the
available space, then adds or subtracts the value, and finally truncates
the result and writes it into the available space.

(cherry picked from commit eaa9ef678c63bf392ec2d5b736605db7ea7e7338)
---
 lld/ELF/Arch/LoongArch.cpp            |  19 +++++
 lld/test/ELF/loongarch-reloc-leb128.s | 102 ++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 lld/test/ELF/loongarch-reloc-leb128.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 1dab98115d9de..8a6f6db68f290 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "llvm/Support/LEB128.h"
 
 using namespace llvm;
 using namespace llvm::object;
@@ -155,6 +156,16 @@ static bool isJirl(uint32_t insn) {
   return (insn & 0xfc000000) == JIRL;
 }
 
+static void handleUleb128(uint8_t *loc, uint64_t val) {
+  const uint32_t maxcount = 1 + 64 / 7;
+  uint32_t count;
+  uint64_t orig = decodeULEB128(loc, &count);
+  if (count > maxcount)
+    errorOrWarn(getErrorLocation(loc) + "extra space for uleb128");
+  uint64_t mask = count < maxcount ? (1ULL << 7 * count) - 1 : -1ULL;
+  encodeULEB128((orig + val) & mask, loc, count);
+}
+
 LoongArch::LoongArch() {
   // The LoongArch ISA itself does not have a limit on page sizes. According to
   // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is
@@ -396,11 +407,13 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
   case R_LARCH_ADD16:
   case R_LARCH_ADD32:
   case R_LARCH_ADD64:
+  case R_LARCH_ADD_ULEB128:
   case R_LARCH_SUB6:
   case R_LARCH_SUB8:
   case R_LARCH_SUB16:
   case R_LARCH_SUB32:
   case R_LARCH_SUB64:
+  case R_LARCH_SUB_ULEB128:
     // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse
     // the RelExpr to avoid code duplication.
     return R_RISCV_ADD;
@@ -635,6 +648,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
   case R_LARCH_ADD64:
     write64le(loc, read64le(loc) + val);
     return;
+  case R_LARCH_ADD_ULEB128:
+    handleUleb128(loc, val);
+    return;
   case R_LARCH_SUB6:
     *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f);
     return;
@@ -650,6 +666,9 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
   case R_LARCH_SUB64:
     write64le(loc, read64le(loc) - val);
     return;
+  case R_LARCH_SUB_ULEB128:
+    handleUleb128(loc, -val);
+    return;
 
   case R_LARCH_MARK_LA:
   case R_LARCH_MARK_PCREL:
diff --git a/lld/test/ELF/loongarch-reloc-leb128.s b/lld/test/ELF/loongarch-reloc-leb128.s
new file mode 100644
index 0000000000000..9e6f221e62b63
--- /dev/null
+++ b/lld/test/ELF/loongarch-reloc-leb128.s
@@ -0,0 +1,102 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax a.s -o a.o
+# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.o | FileCheck %s --check-prefix=REL
+# RUN: ld.lld -shared --gc-sections a.o -o a.so
+# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a.so | FileCheck %s
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax a.s -o a32.o
+# RUN: llvm-readobj -r -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.o | FileCheck %s --check-prefix=REL
+# RUN: ld.lld -shared --gc-sections a32.o -o a32.so
+# RUN: llvm-readelf -x .gcc_except_table -x .debug_rnglists -x .debug_loclists a32.so | FileCheck %s
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax extraspace.s -o extraspace32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax extraspace.s -o extraspace64.o
+# RUN: not ld.lld -shared extraspace32.o 2>&1 | FileCheck %s --check-prefix=ERROR
+# RUN: not ld.lld -shared extraspace64.o 2>&1 | FileCheck %s --check-prefix=ERROR
+# ERROR: error: extraspace{{.*}}.o:(.rodata+0x0): extra space for uleb128
+
+#--- a.s
+.cfi_startproc
+.cfi_lsda 0x1b,.LLSDA0
+.cfi_endproc
+
+.section .text.w,"axR"
+break 0; break 0; break 0; w1:
+  .p2align 4    # 4 bytes after relaxation
+w2: break 0
+
+.section .text.x,"ax"
+break 0; break 0; break 0; x1:
+  .p2align 4    # 4 bytes after relaxation
+x2: break 0
+
+.section .gcc_except_table,"a"
+.LLSDA0:
+.uleb128 w2-w1+116                   # initial value: 0x0080
+.uleb128 w1-w2+141                   # initial value: 0x0080
+.uleb128 w2-w1+16372                 # initial value: 0x008080
+.uleb128 w1-w2+16397                 # initial value: 0x008080
+.uleb128 w2-w1+2097140               # initial value: 0x00808080
+.uleb128 w1-w2+2097165               # initial value: 0x00808080
+
+.section .debug_rnglists
+.uleb128 w2-w1+116                   # initial value: 0x0080
+.uleb128 w1-w2+141                   # initial value: 0x0080
+.uleb128 w2-w1+16372                 # initial value: 0x008080
+.uleb128 w1-w2+16397                 # initial value: 0x008080
+.uleb128 w2-w1+2097140               # initial value: 0x00808080
+.uleb128 w1-w2+2097165               # initial value: 0x00808080
+
+.section .debug_loclists
+.uleb128 x2-x1                       # references discarded symbols
+
+# REL:      Section ({{.*}}) .rela.debug_rnglists {
+# REL-NEXT:   0x0 R_LARCH_ADD_ULEB128 w2 0x74
+# REL-NEXT:   0x0 R_LARCH_SUB_ULEB128 w1 0x0
+# REL-NEXT:   0x2 R_LARCH_ADD_ULEB128 w1 0x8D
+# REL-NEXT:   0x2 R_LARCH_SUB_ULEB128 w2 0x0
+# REL-NEXT:   0x4 R_LARCH_ADD_ULEB128 w2 0x3FF4
+# REL-NEXT:   0x4 R_LARCH_SUB_ULEB128 w1 0x0
+# REL-NEXT:   0x7 R_LARCH_ADD_ULEB128 w1 0x400D
+# REL-NEXT:   0x7 R_LARCH_SUB_ULEB128 w2 0x0
+# REL-NEXT:   0xA R_LARCH_ADD_ULEB128 w2 0x1FFFF4
+# REL-NEXT:   0xA R_LARCH_SUB_ULEB128 w1 0x0
+# REL-NEXT:   0xE R_LARCH_ADD_ULEB128 w1 0x20000D
+# REL-NEXT:   0xE R_LARCH_SUB_ULEB128 w2 0x0
+# REL-NEXT: }
+# REL:      Section ({{.*}}) .rela.debug_loclists {
+# REL-NEXT:   0x0 R_LARCH_ADD_ULEB128 x2 0x0
+# REL-NEXT:   0x0 R_LARCH_SUB_ULEB128 x1 0x0
+# REL-NEXT: }
+
+# REL:      Hex dump of section '.gcc_except_table':
+# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 .
+# REL-NEXT: 0x00000010 8000                                .
+# REL:      Hex dump of section '.debug_rnglists':
+# REL-NEXT: 0x00000000 80008000 80800080 80008080 80008080 .
+# REL-NEXT: 0x00000010 8000                                .
+# REL:      Hex dump of section '.debug_loclists':
+# REL-NEXT: 0x00000000 00                                  .
+
+# CHECK:      Hex dump of section '.gcc_except_table':
+# CHECK-NEXT: 0x[[#%x,]] f8008901 f8ff0089 8001f8ff ff008980 .
+# CHECK-NEXT: 0x[[#%x,]] 8001                                .
+# CHECK:      Hex dump of section '.debug_rnglists':
+# CHECK-NEXT: 0x00000000 f8008901 f8ff0089 8001f8ff ff008980 .
+# CHECK-NEXT: 0x00000010 8001                                .
+# CHECK:      Hex dump of section '.debug_loclists':
+# CHECK-NEXT: 0x00000000 00                                  .
+
+#--- extraspace.s
+.text
+w1:
+  la.pcrel $t0, w1
+w2:
+
+.rodata
+.reloc ., R_LARCH_ADD_ULEB128, w2
+.reloc ., R_LARCH_SUB_ULEB128, w1
+.fill 10, 1, 0x80
+.byte 0

From 55193c2ba53f4156481b63b5956eaadd8edb0877 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Tue, 5 Mar 2024 17:21:16 +0800
Subject: [PATCH 24/46] [InstCombine] Handle scalable splat in
 `getFlippedStrictnessPredicateAndConstant`

(cherry picked from commit d51fcd4ed86ac6075c8a25b053c2b66051feaf62)
---
 .../InstCombine/InstCombineCompares.cpp           |  7 +++++++
 llvm/test/Transforms/InstCombine/pr83931.ll       | 15 +++++++++++++++
 llvm/test/Transforms/InstCombine/select.ll        |  2 +-
 llvm/test/Transforms/InstCombine/vscale_cmp.ll    |  2 +-
 4 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/pr83931.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 8c0fd66225513..9973a80a7db94 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6491,6 +6491,13 @@ InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
       if (!SafeReplacementConstant)
         SafeReplacementConstant = CI;
     }
+  } else if (isa<VectorType>(C->getType())) {
+    // Handle scalable splat
+    Value *SplatC = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(SplatC);
+    // Bail out if the constant can't be safely incremented/decremented.
+    if (!CI || !ConstantIsOk(CI))
+      return std::nullopt;
   } else {
     // ConstantExpr?
     return std::nullopt;
diff --git a/llvm/test/Transforms/InstCombine/pr83931.ll b/llvm/test/Transforms/InstCombine/pr83931.ll
new file mode 100644
index 0000000000000..d36ac8d91abd3
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr83931.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define <vscale x 2 x i1> @dont_crash(<vscale x 2 x i64> %x) {
+; CHECK-LABEL: define <vscale x 2 x i1> @dont_crash(
+; CHECK-SAME: <vscale x 2 x i64> [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RET:%.*]] = icmp sgt <vscale x 2 x i64> [[X]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 -309383, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i1> [[RET]]
+;
+entry:
+  %div = sdiv <vscale x 2 x i64> %x, splat (i64 309383)
+  %ret = icmp sge <vscale x 2 x i64> %div, zeroinitializer
+  ret <vscale x 2 x i1> %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index b7e743c14a52c..888e7d28f78af 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3459,7 +3459,7 @@ define <vscale x 2 x i32> @scalable_sign_bits(<vscale x 2 x i8> %x) {
 define <vscale x 2 x i1> @scalable_non_zero(<vscale x 2 x i32> %x) {
 ; CHECK-LABEL: @scalable_non_zero(
 ; CHECK-NEXT:    [[A:%.*]] = or <vscale x 2 x i32> [[X:%.*]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ule <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 56, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <vscale x 2 x i32> [[A]], shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 57, i64 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[CMP]]
 ;
   %a = or <vscale x 2 x i32> %x, shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/InstCombine/vscale_cmp.ll b/llvm/test/Transforms/InstCombine/vscale_cmp.ll
index a7f8368c5d62c..b2bfc93da089f 100644
--- a/llvm/test/Transforms/InstCombine/vscale_cmp.ll
+++ b/llvm/test/Transforms/InstCombine/vscale_cmp.ll
@@ -3,7 +3,7 @@
 
 define <vscale x 2 x i1> @sge(<vscale x 2 x i8> %x) {
 ; CHECK-LABEL: @sge(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sge <vscale x 2 x i8> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <vscale x 2 x i8> [[X:%.*]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 -1, i64 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
 ; CHECK-NEXT:    ret <vscale x 2 x i1> [[CMP]]
 ;
   %cmp = icmp sge <vscale x 2 x i8> %x, zeroinitializer

From c14bf0a13d426b0b8fc2bc395bf450d9a6982fe3 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Thu, 7 Mar 2024 15:12:21 -0500
Subject: [PATCH 25/46] [libc++] Enable availability based on the compiler
 instead of __has_extension (#84065)

__has_extension(...) doesn't work as intended when -pedantic-errors is
used with Clang. With that flag, __has_extension(...) is equivalent to
__has_feature(...), which means that checks like

    __has_extension(pragma_clang_attribute_external_declaration)

will return 0. In turn, this has the effect of disabling availability
markup in libc++, which is undesirable.

rdar://124078119
(cherry picked from commit 292a28df6c55679fad0589dea35278a8c66b2ae1)
---
 libcxx/include/__availability                 |  7 +++---
 ...lity-with-pedantic-errors.compile.pass.cpp | 22 +++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp

diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index c5069a027750e..b8b2da9bb1226 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -72,11 +72,10 @@
 #  endif
 #endif
 
-// Availability markup is disabled when building the library, or when the compiler
+// Availability markup is disabled when building the library, or when the
+// compiler is not Clang-based, because a compiler other than Clang
 // doesn't support the proper attributes.
-#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) ||                                       \
-    !__has_feature(attribute_availability_with_strict) || !__has_feature(attribute_availability_in_templates) ||       \
-    !__has_extension(pragma_clang_attribute_external_declaration)
+#if defined(_LIBCPP_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) || !defined(_LIBCPP_COMPILER_CLANG_BASED)
 #  if !defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
 #    define _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS
 #  endif
diff --git a/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
new file mode 100644
index 0000000000000..c55a0a4d6e5d1
--- /dev/null
+++ b/libcxx/test/libcxx/vendor/apple/availability-with-pedantic-errors.compile.pass.cpp
@@ -0,0 +1,22 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: stdlib=apple-libc++
+
+// Test that using -pedantic-errors doesn't turn off availability annotations.
+// This used to be the case because we used __has_extension(...) to enable the
+// availability annotations, and -pedantic-errors changes the behavior of
+// __has_extension(...) in an incompatible way.
+
+// ADDITIONAL_COMPILE_FLAGS: -pedantic-errors
+
+#include <__availability>
+
+#if defined(_LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS)
+#  error Availability annotations should be enabled on Apple platforms in the system configuration!
+#endif

From 1de8ea75d9b309fd14e9f1be86ea5079d9a53d69 Mon Sep 17 00:00:00 2001
From: Exile <2094247798@qq.com>
Date: Thu, 7 Mar 2024 00:01:30 +0800
Subject: [PATCH 26/46] [analyzer] Fix crash on dereference invalid return
 value of getAdjustedParameterIndex() (#83585)

Fixes #78810
Thanks to Snape3058 for the comment.

---------

Co-authored-by: miaozhiyuan <miaozhiyuan@feysh.com>
(cherry picked from commit d4687fe7d1639ea5d16190c89a54de1f2c6e2a9a)
---
 clang/lib/StaticAnalyzer/Core/CallEvent.cpp |  2 +-
 clang/test/Analysis/cxx2b-deducing-this.cpp | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index 0ac1d91b79beb..bc14aea27f673 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -1409,7 +1409,7 @@ CallEventManager::getSimpleCall(const CallExpr *CE, ProgramStateRef State,
   if (const auto *OpCE = dyn_cast<CXXOperatorCallExpr>(CE)) {
     const FunctionDecl *DirectCallee = OpCE->getDirectCallee();
     if (const auto *MD = dyn_cast<CXXMethodDecl>(DirectCallee))
-      if (MD->isInstance())
+      if (MD->isImplicitObjectMemberFunction())
         return create<CXXMemberOperatorCall>(OpCE, State, LCtx, ElemRef);
 
   } else if (CE->getCallee()->getType()->isBlockPointerType()) {
diff --git a/clang/test/Analysis/cxx2b-deducing-this.cpp b/clang/test/Analysis/cxx2b-deducing-this.cpp
index d22a897097bec..2ec9e96bf0f84 100644
--- a/clang/test/Analysis/cxx2b-deducing-this.cpp
+++ b/clang/test/Analysis/cxx2b-deducing-this.cpp
@@ -60,3 +60,14 @@ void top() {
   s.c();
   s.c(11);
 }
+
+
+struct S2 {
+  bool operator==(this auto, S2) {
+    return true;
+  }
+};
+void use_deducing_this() {
+  int result = S2{} == S2{}; // no-crash
+  clang_analyzer_dump(result); // expected-warning {{1 S32b}}
+}

From d77c5c3830d925b3795e2f1535a6568399fe6626 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Mon, 11 Mar 2024 08:59:17 +0800
Subject: [PATCH 27/46] [LoongArch] Make sure that the LoongArchISD::BSTRINS
 node uses the correct `MSB` value (#84454)

The `MSB` must not be greater than `GRLen`. Without this patch, newly
added test cases will crash with LoongArch32, resulting in a 'cannot
select' error.

(cherry picked from commit edd4c6c6dca4c556de22b2ab73d5bfc02d28e59b)
---
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp |  4 +++-
 llvm/test/CodeGen/LoongArch/bstrins_w.ll            | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index b161c5434ca13..907aae13d6de0 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2343,7 +2343,9 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(
         LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
         DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy),
-        DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+        DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+                                      : (MaskIdx0 + MaskLen0 - 1),
+                        DL, GRLenVT),
         DAG.getConstant(MaskIdx0, DL, GRLenVT));
   }
 
diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
index dfbe000841cdc..e008caacad2a1 100644
--- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll
+++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
@@ -145,6 +145,19 @@ define i32 @pat5(i32 %a) nounwind {
   ret i32 %or
 }
 
+;; The high bits of `const` are zero.
+define i32 @pat5_high_zeros(i32 %a) nounwind {
+; CHECK-LABEL: pat5_high_zeros:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 1
+; CHECK-NEXT:    ori $a1, $a1, 564
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 16
+; CHECK-NEXT:    ret
+  %and = and i32 %a, 65535      ; 0x0000ffff
+  %or = or i32 %and, 305397760  ; 0x12340000
+  ret i32 %or
+}
+
 ;; Pattern 6: a = b | ((c & mask) << shamt)
 ;; In this testcase b is 0x10000002, but in fact we do not require b being a
 ;; constant. As long as all positions in b to be overwritten by the incoming

From a9ba36c7e7d7fa076f201843e3b826b6c6d7f5ef Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Tue, 5 Mar 2024 19:44:28 +0800
Subject: [PATCH 28/46] [Clang][LoongArch] Precommit test for fix wrong return
 value type of __iocsrrd_h. NFC

(cherry picked from commit aeda1a6e800e0dd6c91c0332b4db95094ad5b301)
---
 clang/test/CodeGen/LoongArch/intrinsic-la32.c | 29 ++++++++++++++-----
 clang/test/CodeGen/LoongArch/intrinsic-la64.c | 21 ++++++++++++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
index 93d54f511a9cd..6a8d99880be39 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
@@ -169,8 +169,8 @@ unsigned int cpucfg(unsigned int a) {
 
 // LA32-LABEL: @rdtime(
 // LA32-NEXT:  entry:
-// LA32-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2
-// LA32-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3
+// LA32-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// LA32-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]]
 // LA32-NEXT:    ret void
 //
 void rdtime() {
@@ -201,13 +201,28 @@ void loongarch_movgr2fcsr(int a) {
   __builtin_loongarch_movgr2fcsr(1, a);
 }
 
-// CHECK-LABEL: @cacop_w(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024)
-// CHECK-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024)
-// CHECK-NEXT:    ret void
+// LA32-LABEL: @cacop_w(
+// LA32-NEXT:  entry:
+// LA32-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024)
+// LA32-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024)
+// LA32-NEXT:    ret void
 //
 void cacop_w(unsigned long int a) {
   __cacop_w(1, a, 1024);
   __builtin_loongarch_cacop_w(1, a, 1024);
 }
+
+// LA32-LABEL: @iocsrrd_h_result(
+// LA32-NEXT:  entry:
+// LA32-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// LA32-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
+// LA32-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
+// LA32-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
+// LA32-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
+// LA32-NEXT:    ret i16 [[CONV4]]
+//
+unsigned short iocsrrd_h_result(unsigned int a) {
+  unsigned short b = __iocsrrd_h(a);
+  unsigned short c = __builtin_loongarch_iocsrrd_h(a);
+  return b+c;
+}
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
index a740882eef541..48b6a7a3d2270 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
@@ -387,7 +387,7 @@ unsigned int cpucfg(unsigned int a) {
 
 // CHECK-LABEL: @rdtime_d(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void rdtime_d() {
@@ -396,8 +396,8 @@ void rdtime_d() {
 
 // CHECK-LABEL: @rdtime(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !4
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META4:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void rdtime() {
@@ -427,3 +427,18 @@ void loongarch_movgr2fcsr(int a) {
   __movgr2fcsr(1, a);
   __builtin_loongarch_movgr2fcsr(1, a);
 }
+
+// CHECK-LABEL: @iocsrrd_h_result(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
+// CHECK-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
+// CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-NEXT:    ret i16 [[CONV4]]
+//
+unsigned short iocsrrd_h_result(unsigned int a) {
+  unsigned short b = __iocsrrd_h(a);
+  unsigned short c = __builtin_loongarch_iocsrrd_h(a);
+  return b+c;
+}

From 9b9aee16d4dcf1b4af49988ebd7918fa4ce77e44 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Wed, 6 Mar 2024 10:03:28 +0800
Subject: [PATCH 29/46] [Clang][LoongArch] Fix wrong return value type of
 __iocsrrd_h (#84100)

relate:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/645016.html
(cherry picked from commit 2f479b811274fede36535e34ecb545ac22e399c3)
---
 clang/lib/Headers/larchintrin.h               | 2 +-
 clang/test/CodeGen/LoongArch/intrinsic-la32.c | 8 ++++----
 clang/test/CodeGen/LoongArch/intrinsic-la64.c | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
index a613e5ca0e5ec..f4218295919a0 100644
--- a/clang/lib/Headers/larchintrin.h
+++ b/clang/lib/Headers/larchintrin.h
@@ -156,7 +156,7 @@ extern __inline unsigned char
   return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
 }
 
-extern __inline unsigned char
+extern __inline unsigned short
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     __iocsrrd_h(unsigned int _1) {
   return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
index 6a8d99880be39..eb3f8cbe7ac4c 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
@@ -215,11 +215,11 @@ void cacop_w(unsigned long int a) {
 // LA32-LABEL: @iocsrrd_h_result(
 // LA32-NEXT:  entry:
 // LA32-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// LA32-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16
 // LA32-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
-// LA32-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
-// LA32-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
-// LA32-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
-// LA32-NEXT:    ret i16 [[CONV4]]
+// LA32-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// LA32-NEXT:    [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]]
+// LA32-NEXT:    ret i16 [[CONV3]]
 //
 unsigned short iocsrrd_h_result(unsigned int a) {
   unsigned short b = __iocsrrd_h(a);
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
index 48b6a7a3d2270..50ec358f546ec 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
@@ -431,11 +431,11 @@ void loongarch_movgr2fcsr(int a) {
 // CHECK-LABEL: @iocsrrd_h_result(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
-// CHECK-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
-// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
-// CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
-// CHECK-NEXT:    ret i16 [[CONV4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]]
+// CHECK-NEXT:    ret i16 [[CONV3]]
 //
 unsigned short iocsrrd_h_result(unsigned int a) {
   unsigned short b = __iocsrrd_h(a);

From 3f8711fc5e01685f0a751ef296d16cf9a1f4fd4d Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Tue, 5 Mar 2024 22:34:04 +0800
Subject: [PATCH 30/46] [InstCombine] Fix miscompilation in PR83947 (#83993)

https://github.com/llvm/llvm-project/blob/762f762504967efbe159db5c737154b989afc9bb/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp#L394-L407

Comment from @topperc:
> This transform assumes the mask is a non-zero splat. We only know it's
a splat and not provably all 0s. The mask is a constexpr that includes
the address of the global variable. We can't resolve the constant
expression to an exact value.

Fixes #83947.
---
 llvm/include/llvm/Analysis/VectorUtils.h      |  5 ++
 llvm/lib/Analysis/VectorUtils.cpp             | 25 +++++++
 .../InstCombine/InstCombineCalls.cpp          | 13 ++--
 .../InstCombine/masked_intrinsics.ll          |  6 +-
 llvm/test/Transforms/InstCombine/pr83947.ll   | 67 +++++++++++++++++++
 5 files changed, 110 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/pr83947.ll

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 7a92e62b53c53..c6eb66cc9660c 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -406,6 +406,11 @@ bool maskIsAllZeroOrUndef(Value *Mask);
 /// lanes can be assumed active.
 bool maskIsAllOneOrUndef(Value *Mask);
 
+/// Given a mask vector of i1, Return true if any of the elements of this
+/// predicate mask are known to be true or undef.  That is, return true if at
+/// least one lane can be assumed active.
+bool maskContainsAllOneOrUndef(Value *Mask);
+
 /// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
 /// for each lane which may be active.
 APInt possiblyDemandedEltsInMask(Value *Mask);
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 73facc76a92b2..bf7bc0ba84a03 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1012,6 +1012,31 @@ bool llvm::maskIsAllOneOrUndef(Value *Mask) {
   return true;
 }
 
+bool llvm::maskContainsAllOneOrUndef(Value *Mask) {
+  assert(isa<VectorType>(Mask->getType()) &&
+         isa<IntegerType>(Mask->getType()->getScalarType()) &&
+         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
+             1 &&
+         "Mask must be a vector of i1");
+
+  auto *ConstMask = dyn_cast<Constant>(Mask);
+  if (!ConstMask)
+    return false;
+  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
+    return true;
+  if (isa<ScalableVectorType>(ConstMask->getType()))
+    return false;
+  for (unsigned
+           I = 0,
+           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
+       I != E; ++I) {
+    if (auto *MaskElt = ConstMask->getAggregateElement(I))
+      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
+        return true;
+  }
+  return false;
+}
+
 /// TODO: This is a lot like known bits, but for
 /// vectors.  Is there something we can common this with?
 APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a647be2d26c76..bc43edb5e6206 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -412,11 +412,14 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
   if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) {
     // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr
     if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) {
-      Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
-      StoreInst *S =
-          new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment);
-      S->copyMetadata(II);
-      return S;
+      if (maskContainsAllOneOrUndef(ConstMask)) {
+        Align Alignment =
+            cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+        StoreInst *S = new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false,
+                                     Alignment);
+        S->copyMetadata(II);
+        return S;
+      }
     }
     // scatter(vector, splat(ptr), splat(true)) -> store extract(vector,
     // lastlane), ptr
diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
index 2704905f7a358..c87c1199f727e 100644
--- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll
@@ -292,7 +292,11 @@ entry:
 define void @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask(ptr %dst, i16 %val) {
 ; CHECK-LABEL: @scatter_nxv4i16_uniform_vals_uniform_ptrs_all_active_mask(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i16 [[VAL:%.*]], ptr [[DST:%.*]], align 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_VALUE:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[VAL:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLATVALUE:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_VALUE]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[BROADCAST_SPLATVALUE]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> zeroinitializer, i1 true, i32 0), <vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/InstCombine/pr83947.ll b/llvm/test/Transforms/InstCombine/pr83947.ll
new file mode 100644
index 0000000000000..c1d601ff63718
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr83947.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+@c = global i32 0, align 4
+@b = global i32 0, align 4
+
+define void @masked_scatter1() {
+; CHECK-LABEL: define void @masked_scatter1() {
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr @c, i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> splat (ptr @c), i32 4, <vscale x 4 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c)))
+  ret void
+}
+
+define void @masked_scatter2() {
+; CHECK-LABEL: define void @masked_scatter2() {
+; CHECK-NEXT:    store i32 0, ptr @c, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 true))
+  ret void
+}
+
+define void @masked_scatter3() {
+; CHECK-LABEL: define void @masked_scatter3() {
+; CHECK-NEXT:    store i32 0, ptr @c, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> undef)
+  ret void
+}
+
+define void @masked_scatter4() {
+; CHECK-LABEL: define void @masked_scatter4() {
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 false))
+  ret void
+}
+
+define void @masked_scatter5() {
+; CHECK-LABEL: define void @masked_scatter5() {
+; CHECK-NEXT:    store i32 0, ptr @c, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> <i1 true, i1 false>)
+  ret void
+}
+
+define void @masked_scatter6() {
+; CHECK-LABEL: define void @masked_scatter6() {
+; CHECK-NEXT:    store i32 0, ptr @c, align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> <i1 undef, i1 false>)
+  ret void
+}
+
+define void @masked_scatter7() {
+; CHECK-LABEL: define void @masked_scatter7() {
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> <ptr @c, ptr @c>, i32 4, <2 x i1> <i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c)>)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c)))
+  ret void
+}

From 39e3ba8a383e05af376d613594373c482f72bb3e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 6 Mar 2024 20:08:00 +0000
Subject: [PATCH 31/46] [DSE] Remove malloc from EarliestEscapeInfo before
 removing. (#84157)

Not removing the malloc from earliest escape info leaves stale entries
in the cache.

Fixes https://github.com/llvm/llvm-project/issues/84051.

PR: https://github.com/llvm/llvm-project/pull/84157
(cherry picked from commit eb8f379567e8d014194faefe02ce92813e237afc)
---
 .../Scalar/DeadStoreElimination.cpp           |   4 +-
 ...alloc-earliest-escape-info-invalidation.ll | 302 ++++++++++++++++++
 2 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll

diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 340fba4fb9c5a..380d658365536 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1907,15 +1907,15 @@ struct DSEState {
                               Malloc->getArgOperand(0), IRB, TLI);
     if (!Calloc)
       return false;
+
     MemorySSAUpdater Updater(&MSSA);
     auto *NewAccess =
       Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), nullptr,
                                       MallocDef);
     auto *NewAccessMD = cast<MemoryDef>(NewAccess);
     Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
-    Updater.removeMemoryAccess(Malloc);
     Malloc->replaceAllUsesWith(Calloc);
-    Malloc->eraseFromParent();
+    deleteDeadInstruction(Malloc);
     return true;
   }
 
diff --git a/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll
new file mode 100644
index 0000000000000..60a010cd49ced
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/malloc-earliest-escape-info-invalidation.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p dse -S %s | FileCheck %s
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+
+define void @widget(ptr %a) {
+; CHECK-LABEL: define void @widget(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[CALL1:%.*]] = tail call noalias ptr @malloc(i64 0)
+; CHECK-NEXT:    store ptr [[CALL1]], ptr [[A]], align 8
+; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[A]], align 8
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 8
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR3:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 1
+; CHECK-NEXT:    [[GETELEMENTPTR4:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 8
+; CHECK-NEXT:    store i16 0, ptr [[GETELEMENTPTR4]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR]], i64 12
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr inttoptr (i64 4 to ptr), align 4
+; CHECK-NEXT:    br label [[BB48:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    br label [[BB9:%.*]]
+; CHECK:       bb8:
+; CHECK-NEXT:    br label [[BB53:%.*]]
+; CHECK:       bb9:
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[CALL1]], [[BB7:%.*]] ], [ [[A]], [[BB43:%.*]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR10:%.*]] = getelementptr i8, ptr [[PHI]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR11:%.*]] = getelementptr i8, ptr [[PHI]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR12:%.*]] = getelementptr i8, ptr [[PHI]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR13:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 1
+; CHECK-NEXT:    store i8 0, ptr [[CALL1]], align 1
+; CHECK-NEXT:    br label [[BB29:%.*]]
+; CHECK:       bb14:
+; CHECK-NEXT:    [[GETELEMENTPTR15:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR10]], i64 8
+; CHECK-NEXT:    [[LOAD16:%.*]] = load i16, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br i1 false, label [[BB22:%.*]], label [[BB17:%.*]]
+; CHECK:       bb17:
+; CHECK-NEXT:    [[GETELEMENTPTR18:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR11]], i64 8
+; CHECK-NEXT:    [[LOAD19:%.*]] = load i16, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR20:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8
+; CHECK-NEXT:    store i16 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR21:%.*]] = getelementptr i8, ptr [[PHI]], i64 0
+; CHECK-NEXT:    br label [[BB25:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[GETELEMENTPTR23:%.*]] = getelementptr i8, ptr [[PHI]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR24:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR23]], i64 12
+; CHECK-NEXT:    br label [[BB25]]
+; CHECK:       bb25:
+; CHECK-NEXT:    [[PHI26:%.*]] = phi ptr [ [[A]], [[BB17]] ], [ [[CALL1]], [[BB22]] ]
+; CHECK-NEXT:    [[PHI27:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ]
+; CHECK-NEXT:    [[PHI28:%.*]] = phi ptr [ [[CALL1]], [[BB17]] ], [ [[CALL1]], [[BB22]] ]
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB29]]
+; CHECK:       bb29:
+; CHECK-NEXT:    [[PHI30:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ]
+; CHECK-NEXT:    [[PHI31:%.*]] = phi ptr [ [[CALL1]], [[BB9]] ], [ [[CALL1]], [[BB25]] ]
+; CHECK-NEXT:    [[LOAD32:%.*]] = load i8, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[LOAD33:%.*]] = load i8, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR34:%.*]] = getelementptr i8, ptr [[PHI31]], i64 12
+; CHECK-NEXT:    [[GETELEMENTPTR35:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12
+; CHECK-NEXT:    br label [[BB86:%.*]]
+; CHECK:       bb36:
+; CHECK-NEXT:    [[GETELEMENTPTR37:%.*]] = getelementptr i8, ptr [[PHI30]], i64 12
+; CHECK-NEXT:    br label [[BB38:%.*]]
+; CHECK:       bb38:
+; CHECK-NEXT:    [[GETELEMENTPTR39:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0
+; CHECK-NEXT:    [[LOAD40:%.*]] = load i32, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR41:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR37]], i64 0, i64 0
+; CHECK-NEXT:    [[LOAD42:%.*]] = load i32, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB38]]
+; CHECK:       bb43:
+; CHECK-NEXT:    [[GETELEMENTPTR44:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 8
+; CHECK-NEXT:    [[LOAD45:%.*]] = load i16, ptr [[CALL1]], align 4
+; CHECK-NEXT:    store i16 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    store i8 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR46:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 12
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR47:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR12]], i64 16
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB9]]
+; CHECK:       bb48:
+; CHECK-NEXT:    [[GETELEMENTPTR49:%.*]] = getelementptr i8, ptr [[CALL1]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR50:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 1
+; CHECK-NEXT:    [[GETELEMENTPTR51:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 8
+; CHECK-NEXT:    [[GETELEMENTPTR52:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR49]], i64 12
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB48]]
+; CHECK:       bb53:
+; CHECK-NEXT:    [[PHI54:%.*]] = phi ptr [ [[CALL1]], [[BB8:%.*]] ], [ [[A]], [[BB71:%.*]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR55:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR56:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR57:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR58:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 1
+; CHECK-NEXT:    br label [[BB71]]
+; CHECK:       bb59:
+; CHECK-NEXT:    [[GETELEMENTPTR60:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0
+; CHECK-NEXT:    [[GETELEMENTPTR61:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR60]], i64 12
+; CHECK-NEXT:    br label [[BB67:%.*]]
+; CHECK:       bb62:
+; CHECK-NEXT:    [[GETELEMENTPTR63:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR56]], i64 8
+; CHECK-NEXT:    [[LOAD64:%.*]] = load i16, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR65:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8
+; CHECK-NEXT:    store i16 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR66:%.*]] = getelementptr i8, ptr [[PHI54]], i64 0
+; CHECK-NEXT:    br label [[BB67]]
+; CHECK:       bb67:
+; CHECK-NEXT:    [[PHI68:%.*]] = phi ptr [ [[A]], [[BB62:%.*]] ], [ [[CALL1]], [[BB59:%.*]] ]
+; CHECK-NEXT:    [[PHI69:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ]
+; CHECK-NEXT:    [[PHI70:%.*]] = phi ptr [ [[CALL1]], [[BB62]] ], [ [[CALL1]], [[BB59]] ]
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB71]]
+; CHECK:       bb71:
+; CHECK-NEXT:    [[PHI72:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ]
+; CHECK-NEXT:    [[PHI73:%.*]] = phi ptr [ [[CALL1]], [[BB53]] ], [ [[CALL1]], [[BB67]] ]
+; CHECK-NEXT:    [[LOAD74:%.*]] = load i8, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[LOAD75:%.*]] = load i8, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR76:%.*]] = getelementptr i8, ptr [[PHI72]], i64 12
+; CHECK-NEXT:    [[GETELEMENTPTR77:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12
+; CHECK-NEXT:    [[GETELEMENTPTR78:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR76]], i64 0, i64 0
+; CHECK-NEXT:    [[LOAD79:%.*]] = load i32, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR80:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR77]], i64 0, i64 0
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[LOAD81:%.*]] = load i8, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR82:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 8
+; CHECK-NEXT:    [[LOAD83:%.*]] = load i16, ptr [[CALL1]], align 4
+; CHECK-NEXT:    store i16 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    store i8 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR84:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 12
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR85:%.*]] = getelementptr i8, ptr [[GETELEMENTPTR57]], i64 16
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB53]]
+; CHECK:       bb86:
+; CHECK-NEXT:    [[GETELEMENTPTR87:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR34]], i64 0, i64 0
+; CHECK-NEXT:    [[LOAD88:%.*]] = load i32, ptr [[CALL1]], align 4
+; CHECK-NEXT:    [[GETELEMENTPTR89:%.*]] = getelementptr [0 x i32], ptr [[GETELEMENTPTR35]], i64 0, i64 0
+; CHECK-NEXT:    store i32 0, ptr [[CALL1]], align 4
+; CHECK-NEXT:    br label [[BB86]]
+;
+bb:
+  %call = tail call ptr @malloc(i64 1)
+  tail call void @llvm.memset.p0.i64(ptr %call, i8 0, i64 1, i1 false)
+  %call1 = tail call noalias ptr @malloc(i64 0)
+  store ptr %call1, ptr %a, align 8
+  %load = load ptr, ptr %a, align 8
+  %load2 = load i32, ptr %load, align 8
+  %getelementptr = getelementptr i8, ptr %call1, i64 0
+  %getelementptr3 = getelementptr i8, ptr %getelementptr, i64 1
+  store i8 0, ptr %call1, align 1
+  %getelementptr4 = getelementptr i8, ptr %getelementptr, i64 8
+  store i16 0, ptr %getelementptr4, align 4
+  %getelementptr5 = getelementptr i8, ptr %getelementptr, i64 12
+  store i32 0, ptr %call1, align 4
+  %load6 = load i32, ptr inttoptr (i64 4 to ptr), align 4
+  br label %bb48
+
+bb7:                                              ; No predecessors!
+  br label %bb9
+
+bb8:                                              ; No predecessors!
+  br label %bb53
+
+bb9:                                              ; preds = %bb43, %bb7
+  %phi = phi ptr [ %call1, %bb7 ], [ %a, %bb43 ]
+  %getelementptr10 = getelementptr i8, ptr %phi, i64 0
+  %getelementptr11 = getelementptr i8, ptr %phi, i64 0
+  %getelementptr12 = getelementptr i8, ptr %phi, i64 0
+  %getelementptr13 = getelementptr i8, ptr %getelementptr12, i64 1
+  store i8 0, ptr %call1, align 1
+  br label %bb29
+
+bb14:                                             ; No predecessors!
+  %getelementptr15 = getelementptr i8, ptr %getelementptr10, i64 8
+  %load16 = load i16, ptr %call1, align 4
+  br i1 false, label %bb22, label %bb17
+
+bb17:                                             ; preds = %bb14
+  %getelementptr18 = getelementptr i8, ptr %getelementptr11, i64 8
+  %load19 = load i16, ptr %call1, align 4
+  %getelementptr20 = getelementptr i8, ptr %getelementptr12, i64 8
+  store i16 0, ptr %call1, align 4
+  %getelementptr21 = getelementptr i8, ptr %phi, i64 0
+  br label %bb25
+
+bb22:                                             ; preds = %bb14
+  %getelementptr23 = getelementptr i8, ptr %phi, i64 0
+  %getelementptr24 = getelementptr i8, ptr %getelementptr23, i64 12
+  br label %bb25
+
+bb25:                                             ; preds = %bb22, %bb17
+  %phi26 = phi ptr [ %a, %bb17 ], [ %call1, %bb22 ]
+  %phi27 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ]
+  %phi28 = phi ptr [ %call1, %bb17 ], [ %call1, %bb22 ]
+  store i32 0, ptr %call1, align 4
+  br label %bb29
+
+bb29:                                             ; preds = %bb25, %bb9
+  %phi30 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ]
+  %phi31 = phi ptr [ %call1, %bb9 ], [ %call1, %bb25 ]
+  %load32 = load i8, ptr %call1, align 4
+  %load33 = load i8, ptr %call1, align 4
+  %getelementptr34 = getelementptr i8, ptr %phi31, i64 12
+  %getelementptr35 = getelementptr i8, ptr %getelementptr12, i64 12
+  br label %bb86
+
+bb36:                                             ; No predecessors!
+  %getelementptr37 = getelementptr i8, ptr %phi30, i64 12
+  br label %bb38
+
+bb38:                                             ; preds = %bb38, %bb36
+  %getelementptr39 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0
+  %load40 = load i32, ptr %call1, align 4
+  %getelementptr41 = getelementptr [0 x i32], ptr %getelementptr37, i64 0, i64 0
+  %load42 = load i32, ptr %call1, align 4
+  br label %bb38
+
+bb43:                                             ; No predecessors!
+  %getelementptr44 = getelementptr i8, ptr %getelementptr12, i64 8
+  %load45 = load i16, ptr %call1, align 4
+  store i16 0, ptr %call1, align 4
+  store i8 0, ptr %call1, align 4
+  %getelementptr46 = getelementptr i8, ptr %getelementptr12, i64 12
+  store i32 0, ptr %call1, align 4
+  %getelementptr47 = getelementptr i8, ptr %getelementptr12, i64 16
+  store i32 0, ptr %call1, align 4
+  br label %bb9
+
+bb48:                                             ; preds = %bb48, %bb
+  %getelementptr49 = getelementptr i8, ptr %call1, i64 0
+  %getelementptr50 = getelementptr i8, ptr %getelementptr49, i64 1
+  store i8 0, ptr %call1, align 1
+  %getelementptr51 = getelementptr i8, ptr %getelementptr49, i64 8
+  store i16 0, ptr %call1, align 4
+  %getelementptr52 = getelementptr i8, ptr %getelementptr49, i64 12
+  store i32 0, ptr %call1, align 4
+  br label %bb48
+
+bb53:                                             ; preds = %bb71, %bb8
+  %phi54 = phi ptr [ %call1, %bb8 ], [ %a, %bb71 ]
+  %getelementptr55 = getelementptr i8, ptr %phi54, i64 0
+  %getelementptr56 = getelementptr i8, ptr %phi54, i64 0
+  %getelementptr57 = getelementptr i8, ptr %phi54, i64 0
+  %getelementptr58 = getelementptr i8, ptr %getelementptr57, i64 1
+  br label %bb71
+
+bb59:                                             ; No predecessors!
+  %getelementptr60 = getelementptr i8, ptr %phi54, i64 0
+  %getelementptr61 = getelementptr i8, ptr %getelementptr60, i64 12
+  br label %bb67
+
+bb62:                                             ; No predecessors!
+  %getelementptr63 = getelementptr i8, ptr %getelementptr56, i64 8
+  %load64 = load i16, ptr %call1, align 4
+  %getelementptr65 = getelementptr i8, ptr %getelementptr57, i64 8
+  store i16 0, ptr %call1, align 4
+  %getelementptr66 = getelementptr i8, ptr %phi54, i64 0
+  br label %bb67
+
+bb67:                                             ; preds = %bb62, %bb59
+  %phi68 = phi ptr [ %a, %bb62 ], [ %call1, %bb59 ]
+  %phi69 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ]
+  %phi70 = phi ptr [ %call1, %bb62 ], [ %call1, %bb59 ]
+  store i32 0, ptr %call1, align 4
+  br label %bb71
+
+bb71:                                             ; preds = %bb67, %bb53
+  %phi72 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ]
+  %phi73 = phi ptr [ %call1, %bb53 ], [ %call1, %bb67 ]
+  %load74 = load i8, ptr %call1, align 4
+  %load75 = load i8, ptr %call1, align 4
+  %getelementptr76 = getelementptr i8, ptr %phi72, i64 12
+  %getelementptr77 = getelementptr i8, ptr %getelementptr57, i64 12
+  %getelementptr78 = getelementptr [0 x i32], ptr %getelementptr76, i64 0, i64 0
+  %load79 = load i32, ptr %call1, align 4
+  %getelementptr80 = getelementptr [0 x i32], ptr %getelementptr77, i64 0, i64 0
+  store i32 0, ptr %call1, align 4
+  %load81 = load i8, ptr %call1, align 4
+  %getelementptr82 = getelementptr i8, ptr %getelementptr57, i64 8
+  %load83 = load i16, ptr %call1, align 4
+  store i16 0, ptr %call1, align 4
+  store i8 0, ptr %call1, align 4
+  %getelementptr84 = getelementptr i8, ptr %getelementptr57, i64 12
+  store i32 0, ptr %call1, align 4
+  %getelementptr85 = getelementptr i8, ptr %getelementptr57, i64 16
+  store i32 0, ptr %call1, align 4
+  br label %bb53
+
+bb86:                                             ; preds = %bb86, %bb29
+  %getelementptr87 = getelementptr [0 x i32], ptr %getelementptr34, i64 0, i64 0
+  %load88 = load i32, ptr %call1, align 4
+  %getelementptr89 = getelementptr [0 x i32], ptr %getelementptr35, i64 0, i64 0
+  store i32 0, ptr %call1, align 4
+  br label %bb86
+}
+
+declare ptr @malloc(i64)
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: write) }

From fcc33dca02d1f22d3dad5c4558ddff4926aef9d9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 10 Mar 2024 16:23:51 +0000
Subject: [PATCH 32/46] [X86] combineAndShuffleNot - ensure the type is legal
 before creating X86ISD::ANDNP target nodes

Fixes #84660

(cherry picked from commit 862c7e0218f27b55a5b75ae59a4f73cd4610448d)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +++++++++--
 llvm/test/CodeGen/X86/combine-and.ll    | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a071c5a3ca032..9e64726fb6fff 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47878,6 +47878,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
   SDValue X, Y;
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   if (SDValue Not = GetNot(N0)) {
     X = Not;
@@ -47891,9 +47892,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
   X = DAG.getBitcast(VT, X);
   Y = DAG.getBitcast(VT, Y);
   SDLoc DL(N);
+
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+      TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
     SDValue LoY, HiY;
@@ -47903,7 +47906,11 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
     SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
   }
-  return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
+
+  if (TLI.isTypeLegal(VT))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
+
+  return SDValue();
 }
 
 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index d223b75419ac4..294fcd6a9563e 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -1171,6 +1171,25 @@ define <4 x i32> @neg_scalar_broadcast_two_uses(i32 %a0, <4 x i32> %a1, ptr %a2)
   ret <4 x i32> %4
 }
 
+; PR84660 - check for illegal types
+define <2 x i128> @neg_scalar_broadcast_illegaltype(i128 %arg) {
+; CHECK-LABEL: neg_scalar_broadcast_illegaltype:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    notl %esi
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    movq %rsi, 16(%rdi)
+; CHECK-NEXT:    movq %rsi, (%rdi)
+; CHECK-NEXT:    movq $0, 24(%rdi)
+; CHECK-NEXT:    movq $0, 8(%rdi)
+; CHECK-NEXT:    retq
+  %i = xor i128 %arg, 1
+  %i1 = insertelement <2 x i128> zeroinitializer, i128 %i, i64 0
+  %i2 = shufflevector <2 x i128> %i1, <2 x i128> zeroinitializer, <2 x i32> zeroinitializer
+  %i3 = and <2 x i128> <i128 1, i128 1>, %i2
+  ret <2 x i128> %i3
+}
+
 define <2 x i64> @andnp_xx(<2 x i64> %v0) nounwind {
 ; SSE-LABEL: andnp_xx:
 ; SSE:       # %bb.0:

From 2fc8bea42f992901cf4dbbe8b62c3383b2eb0288 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 12 Mar 2024 00:03:26 +0200
Subject: [PATCH 33/46] [LLD] [COFF] Set the right alignment for
 DelayDirectoryChunk (#84697)

This makes a difference when linking executables with delay loaded
libraries for arm32; the delay loader implementation can load data from
the registry with instructions that assume alignment.

This issue does not show up when linking in MinGW mode, because a
PseudoRelocTableChunk gets injected, which also sets alignment, even if
the chunk itself is empty.

(cherry picked from commit c93c76b562784926b22a69d3f82a5032dcb4a274)
---
 lld/COFF/DLL.cpp                      |  2 +-
 lld/test/COFF/delayimports-armnt.yaml | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index 6b516d8c6d5ef..c4388ba9e40d0 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -172,7 +172,7 @@ binImports(COFFLinkerContext &ctx,
 // A chunk for the delay import descriptor table etnry.
 class DelayDirectoryChunk : public NonSectionChunk {
 public:
-  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) {}
+  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) { setAlignment(4); }
 
   size_t getSize() const override {
     return sizeof(delay_import_directory_table_entry);
diff --git a/lld/test/COFF/delayimports-armnt.yaml b/lld/test/COFF/delayimports-armnt.yaml
index 7d9bc38c5c360..ea96d864ef53d 100644
--- a/lld/test/COFF/delayimports-armnt.yaml
+++ b/lld/test/COFF/delayimports-armnt.yaml
@@ -6,6 +6,7 @@
 # RUN: llvm-readobj --coff-imports %t.exe | FileCheck -check-prefix=IMPORT %s
 # RUN: llvm-readobj --coff-basereloc %t.exe | FileCheck -check-prefix=BASEREL %s
 # RUN: llvm-objdump --no-print-imm-hex -d %t.exe | FileCheck --check-prefix=DISASM %s
+# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=DIR %s
 
 # IMPORT:      Format: COFF-ARM
 # IMPORT-NEXT: Arch: thumb
@@ -13,9 +14,9 @@
 # IMPORT-NEXT: DelayImport {
 # IMPORT-NEXT:   Name: library.dll
 # IMPORT-NEXT:   Attributes: 0x1
-# IMPORT-NEXT:   ModuleHandle: 0x3000
-# IMPORT-NEXT:   ImportAddressTable: 0x3008
-# IMPORT-NEXT:   ImportNameTable: 0x2040
+# IMPORT-NEXT:   ModuleHandle: 0x3008
+# IMPORT-NEXT:   ImportAddressTable: 0x3010
+# IMPORT-NEXT:   ImportNameTable: 0x2044
 # IMPORT-NEXT:   BoundDelayImportTable: 0x0
 # IMPORT-NEXT:   UnloadDelayImportTable: 0x0
 # IMPORT-NEXT:   Import {
@@ -43,7 +44,7 @@
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: HIGHLOW
-# BASEREL-NEXT:     Address: 0x3008
+# BASEREL-NEXT:     Address: 0x3010
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: ABSOLUTE
@@ -52,20 +53,24 @@
 # BASEREL-NEXT: ]
 #
 # DISASM:    00401000 <.text>:
-# DISASM:      40100c:       f243 0c08       movw r12, #12296
+# DISASM:      40100c:       f243 0c10       movw r12, #12304
 # DISASM-NEXT:               f2c0 0c40       movt    r12, #64
 # DISASM-NEXT:               f000 b800       b.w     {{.+}} @ imm = #0
 # DISASM-NEXT:               e92d 480f       push.w  {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               f20d 0b10       addw    r11, sp, #16
 # DISASM-NEXT:               ed2d 0b10       vpush   {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               4661            mov     r1, r12
-# DISASM-NEXT:               f242 0000       movw r0, #8192
+# DISASM-NEXT:               f242 0004       movw r0, #8196
 # DISASM-NEXT:               f2c0 0040       movt    r0, #64
 # DISASM-NEXT:               f7ff ffe7       bl      0x401000 <.text>
 # DISASM-NEXT:               4684            mov     r12, r0
 # DISASM-NEXT:               ecbd 0b10       vpop    {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               e8bd 480f       pop.w   {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               4760            bx      r12
+#
+# DIR:         DelayImportDescriptorRVA: 0x2004
+# DIR-NEXT:    DelayImportDescriptorSize: 0x40
+
 
 --- !COFF
 header:
@@ -80,6 +85,14 @@ sections:
       - VirtualAddress:  0
         SymbolName:      __imp_function
         Type:            IMAGE_REL_ARM_MOV32T
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     01
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       1
+    SectionData:     02
 symbols:
   - Name:            .text
     Value:           0

From 25a989ce8bf35ccda064d956305f920bf711a7de Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 11 Mar 2024 21:06:03 +0000
Subject: [PATCH 34/46] [ArgPromotion] Add test case for #84807.

Test case for https://github.com/llvm/llvm-project/issues/84807,
showing a mis-compile in ArgPromotion.

(cherry picked from commit 31ffdb56b4df9b772d763dccabbfde542545d695)
---
 ...ing-and-non-aliasing-loads-with-clobber.ll | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll

diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
new file mode 100644
index 0000000000000..69385a7ea51a7
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p argpromotion -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+@f = dso_local global { i16, i64 } { i16 1, i64 0 }, align 8
+
+; Test case for https://github.com/llvm/llvm-project/issues/84807.
+
+; FIXME: Currently the loads from @callee are moved to @caller, even though
+;        the store in %then may aliases to load from %q.
+
+define i32 @caller1(i1 %c) {
+; CHECK-LABEL: define i32 @caller1(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[F_VAL:%.*]] = load i16, ptr @f, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr @f, i64 8
+; CHECK-NEXT:    [[F_VAL1:%.*]] = load i64, ptr [[TMP0]], align 8
+; CHECK-NEXT:    call void @callee1(i16 [[F_VAL]], i64 [[F_VAL1]], i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee1(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee1(
+; CHECK-SAME: i16 [[Q_0_VAL:%.*]], i64 [[Q_8_VAL:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+
+  uselistorder ptr %q, { 1, 0 }
+}
+
+; Same as @caller1/callee1, but with default uselist order.
+define i32 @caller2(i1 %c) {
+; CHECK-LABEL: define i32 @caller2(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @callee2(ptr noundef nonnull @f, i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee2(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee2(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee2(
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+}
+
+declare void @use(i16, i64)

From 7b61ddefc28a2c88be3a754ceee7bace98e3b187 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 12 Mar 2024 09:47:42 +0000
Subject: [PATCH 35/46] [ArgPromotion] Remove incorrect TranspBlocks set for
 loads. (#84835)

The TranspBlocks set was used to cache aliasing decision for all
processed loads in the parent loop. This is incorrect, because each load
can access a different location, which means one load not being modified
in a block doesn't translate to another load not being modified in the
same block.

All loads access the same underlying object, so we could perhaps use a
location without size for all loads and retain the cache, but that would
mean we lose precision.

For now, just drop the cache.

Fixes https://github.com/llvm/llvm-project/issues/84807

PR: https://github.com/llvm/llvm-project/pull/84835
(cherry picked from commit bba4a1daff6ee09941f1369a4e56b4af95efdc5c)
---
 llvm/lib/Transforms/IPO/ArgumentPromotion.cpp      |  6 +-----
 ...aliasing-and-non-aliasing-loads-with-clobber.ll | 14 +++++++-------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 8058282c42250..062a3d341007c 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -652,10 +652,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   // check to see if the pointer is guaranteed to not be modified from entry of
   // the function to each of the load instructions.
 
-  // Because there could be several/many load instructions, remember which
-  // blocks we know to be transparent to the load.
-  df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
-
   for (LoadInst *Load : Loads) {
     // Check to see if the load is invalidated from the start of the block to
     // the load itself.
@@ -669,7 +665,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
     // To do this, we perform a depth first search on the inverse CFG from the
     // loading block.
     for (BasicBlock *P : predecessors(BB)) {
-      for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+      for (BasicBlock *TranspBB : inverse_depth_first(P))
         if (AAR.canBasicBlockModify(*TranspBB, Loc))
           return false;
     }
diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
index 69385a7ea51a7..1e1669b29b0db 100644
--- a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
@@ -7,17 +7,14 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:
 
 ; Test case for https://github.com/llvm/llvm-project/issues/84807.
 
-; FIXME: Currently the loads from @callee are moved to @caller, even though
-;        the store in %then may aliases to load from %q.
+; Make sure the loads from @callee are not moved to @caller, as the store
+; in %then may alias the load from %q.
 
 define i32 @caller1(i1 %c) {
 ; CHECK-LABEL: define i32 @caller1(
 ; CHECK-SAME: i1 [[C:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[F_VAL:%.*]] = load i16, ptr @f, align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr @f, i64 8
-; CHECK-NEXT:    [[F_VAL1:%.*]] = load i64, ptr [[TMP0]], align 8
-; CHECK-NEXT:    call void @callee1(i16 [[F_VAL]], i64 [[F_VAL1]], i1 [[C]])
+; CHECK-NEXT:    call void @callee1(ptr noundef nonnull @f, i1 [[C]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -27,13 +24,16 @@ entry:
 
 define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) {
 ; CHECK-LABEL: define internal void @callee1(
-; CHECK-SAME: i16 [[Q_0_VAL:%.*]], i64 [[Q_8_VAL:%.*]], i1 [[C:%.*]]) {
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    store i16 123, ptr @f, align 8
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
 ; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
 ; CHECK-NEXT:    ret void
 ;

From 159969b3880b89fdd6ee262ccee6c74a5c79607a Mon Sep 17 00:00:00 2001
From: azhan92 <alisonxzhang@gmail.com>
Date: Thu, 15 Feb 2024 21:27:45 -0500
Subject: [PATCH 36/46] [Release] Install compiler-rt builtins during Phase 1
 on AIX (#81485)

The current test-release.sh script does not install the necessary
compiler-rt builtins during Phase 1 on AIX, resulting in a
non-functional Phase 1 clang. Furthermore, the installation is also
necessary for Phase 2 on AIX.

Co-authored-by: Alison Zhang <alisonzhang@ibm.com>
(cherry picked from commit 3af5c98200e0b1268f755c3f289be4f73aac4214)
---
 llvm/utils/release/test-release.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh
index 0af16387ce1d8..4314b565e11b0 100755
--- a/llvm/utils/release/test-release.sh
+++ b/llvm/utils/release/test-release.sh
@@ -532,9 +532,9 @@ function build_llvmCore() {
       BuildTarget="clang"
       InstallTarget="install-clang install-clang-resource-headers"
       # compiler-rt builtins is needed on AIX to have a functional Phase 1 clang.
-      if [ "$System" = "AIX" -o "$Phase" != "1" ]; then
+      if [ "$System" = "AIX" ]; then
         BuildTarget="$BuildTarget runtimes"
-        InstallTarget="$InstallTarget install-runtimes"
+        InstallTarget="$InstallTarget install-builtins"
       fi
     fi
     if [ "$Phase" -eq "3" ]; then

From b01c3dcf2eb5bdadd0df30e9ff5160f2da17293f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 7 Mar 2024 13:53:02 +0000
Subject: [PATCH 37/46] [LAA] Add test case for #82665.

Test case for https://github.com/llvm/llvm-project/issues/82665.

(cherry picked from commit 4cfd4a7896b5fd50274ec8573c259d7ad41741de)
---
 .../underlying-object-loop-varying-phi.ll     | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
new file mode 100644
index 0000000000000..1a5a6ac08d404
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test case for https://github.com/llvm/llvm-project/issues/82665.
+define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) {
+; CHECK-LABEL: 'indirect_ptr_recurrences_read_write'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ]
+  %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv
+  %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6
+  %l = load i32, ptr %ptr.recur, align 4, !tbaa !10
+  %xor = xor i32 %l, 1
+  store i32 %xor, ptr %ptr.recur, align 4, !tbaa !10
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 5
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define i32 @indirect_ptr_recurrences_read_only_loop(ptr %A, ptr %B) {
+; CHECK-LABEL: 'indirect_ptr_recurrences_read_only_loop'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %xor, %loop ]
+  %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv
+  %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6
+  %l = load i32, ptr %ptr.recur, align 4, !tbaa !10
+  %xor = xor i32 %l, 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 5
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %xor
+}
+
+define void @indirect_ptr_recurrences_read_write_may_alias_no_tbaa(ptr %A, ptr %B) {
+; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_no_tbaa'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ]
+  %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv
+  %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6
+  %l = load i32, ptr %ptr.recur, align 4
+  %xor = xor i32 %l, 1
+  store i32 %xor, ptr %ptr.recur, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 5
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @indirect_ptr_recurrences_read_write_may_alias_different_obj(ptr %A, ptr %B, ptr %C) {
+; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_alias_different_obj'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ]
+  %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv
+  %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6
+  %l = load i32, ptr %ptr.recur, align 4
+  %xor = xor i32 %l, 1
+  %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv
+  store i32 %xor, ptr %gep.C, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 5
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @indirect_ptr_recurrences_read_write_may_noalias_different_obj(ptr %A, ptr %B, ptr noalias %C) {
+; CHECK-LABEL: 'indirect_ptr_recurrences_read_write_may_noalias_different_obj'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop ]
+  %ptr.recur = phi ptr [ %A, %entry ], [ %ptr.next, %loop ]
+  %gep.B = getelementptr inbounds ptr, ptr %B, i64 %iv
+  %ptr.next = load ptr, ptr %gep.B, align 8, !tbaa !6
+  %l = load i32, ptr %ptr.recur, align 4
+  %xor = xor i32 %l, 1
+  %gep.C = getelementptr inbounds ptr, ptr %C, i64 %iv
+  store i32 %xor, ptr %gep.C, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 5
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+!6 = !{!7, !7, i64 0}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !8, i64 0}

From c7eb919d2cbef765e058c977b3ab0801b6a89b66 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 12 Mar 2024 08:55:03 +0000
Subject: [PATCH 38/46] [ValueTracking] Treat phi as underlying obj when not
 decomposing further (#84339)

At the moment, getUnderlyingObjects simply continues for phis that do
not refer to the same underlying object in loops, without adding them to
the list of underlying objects, effectively ignoring those phis.

Instead of ignoring those phis, add them to the list of underlying
objects. This fixes a miscompile where LoopAccessAnalysis fails to
identify a memory dependence, because no underlying objects can be found
for a set of memory accesses.

Fixes https://github.com/llvm/llvm-project/issues/82665.

PR: https://github.com/llvm/llvm-project/pull/84339
(cherry picked from commit b274b23665dec30f3ae4fb83ccca8b77e6d3ada3)
---
 llvm/lib/Analysis/ValueTracking.cpp                        | 2 ++
 .../underlying-object-loop-varying-phi.ll                  | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 412115eb649c2..9f9451e4e814a 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5986,6 +5986,8 @@ void llvm::getUnderlyingObjects(const Value *V,
       if (!LI || !LI->isLoopHeader(PN->getParent()) ||
           isSameUnderlyingObjectInLoop(PN, LI))
         append_range(Worklist, PN->incoming_values());
+      else
+        Objects.push_back(P);
       continue;
     }
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
index 1a5a6ac08d404..106dc8c13a49f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
@@ -7,8 +7,13 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'indirect_ptr_recurrences_read_write'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndidrectUnsafe:
+; CHECK-NEXT:            %l = load i32, ptr %ptr.recur, align 4, !tbaa !4 ->
+; CHECK-NEXT:            store i32 %xor, ptr %ptr.recur, align 4, !tbaa !4
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:

From 38cf35dee880e9e8545e7c2997201ae28f3a6738 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 4 Mar 2024 11:32:07 +0100
Subject: [PATCH 39/46] [Inline] Add test for #67054 (NFC)

(cherry picked from commit cad6ad2759a782c48193f83886488dacc9f330e3)
---
 .../Inline/X86/call-abi-compatibility.ll      | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
index 3a30980fe31bd..f03270bafea99 100644
--- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
+++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
@@ -93,3 +93,34 @@ define internal void @caller_not_avx4() {
 }
 
 declare i64 @caller_unknown_simple(i64)
+
+; FIXME: This call should get inlined, because the callee only contains
+; inline ASM, not real calls.
+define <8 x i64> @caller_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #0 {
+; CHECK-LABEL: define {{[^@]+}}@caller_inline_asm
+; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call <8 x i64> @callee_inline_asm(ptr [[P0]], i64 [[K]], ptr [[P1]], ptr [[P2]])
+; CHECK-NEXT:    ret <8 x i64> [[CALL]]
+;
+  %call = call <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2)
+  ret <8 x i64> %call
+}
+
+define internal <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #1 {
+; CHECK-LABEL: define {{[^@]+}}@callee_inline_asm
+; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    [[SRC:%.*]] = load <8 x i64>, ptr [[P0]], align 64
+; CHECK-NEXT:    [[A:%.*]] = load <8 x i64>, ptr [[P1]], align 64
+; CHECK-NEXT:    [[B:%.*]] = load <8 x i64>, ptr [[P2]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A]], <8 x i64> [[B]], <8 x i64> [[SRC]])
+; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+;
+  %src = load <8 x i64>, ptr %p0, align 64
+  %a = load <8 x i64>, ptr %p1, align 64
+  %b = load <8 x i64>, ptr %p2, align 64
+  %3 = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 %k, <8 x i64> %a, <8 x i64> %b, <8 x i64> %src) #2
+  ret <8 x i64> %3
+}
+
+attributes #0 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
+attributes #1 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }

From 8c6015db5912dee1cce0e900b6abe5735be09b83 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 5 Mar 2024 14:21:33 +0100
Subject: [PATCH 40/46] [X86][Inline] Skip inline asm in inlining target
 feature check (#83820)

When inlining across functions with different target features, we
perform roughly two checks:
1. The caller features must be a superset of the callee features.
2. Calls in the callee cannot use types where the target features would
   change the call ABI (e.g. by changing whether something is passed in a
   zmm or two ymm registers). The latter check is very crude right now.

The latter check currently also catches inline asm "calls". I believe
that inline asm should be excluded from this check, as it is independent
from the usual call ABI, and instead governed by the inline asm
constraint string.

Fixes https://github.com/llvm/llvm-project/issues/67054.

(cherry picked from commit e84182af919d136d74b75ded4d599b38fb47dfb0)
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp  |  4 ++++
 .../Inline/X86/call-abi-compatibility.ll        | 17 ++++++-----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index cd40b1d3b0933..be774a89eccbb 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6080,6 +6080,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
 
   for (const Instruction &I : instructions(Callee)) {
     if (const auto *CB = dyn_cast<CallBase>(&I)) {
+      // Having more target features is fine for inline ASM.
+      if (CB->isInlineAsm())
+        continue;
+
       SmallVector<Type *, 8> Types;
       for (Value *Arg : CB->args())
         Types.push_back(Arg->getType());
diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
index f03270bafea99..6f582cab2f145 100644
--- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
+++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll
@@ -94,27 +94,22 @@ define internal void @caller_not_avx4() {
 
 declare i64 @caller_unknown_simple(i64)
 
-; FIXME: This call should get inlined, because the callee only contains
+; This call should get inlined, because the callee only contains
 ; inline ASM, not real calls.
 define <8 x i64> @caller_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #0 {
 ; CHECK-LABEL: define {{[^@]+}}@caller_inline_asm
 ; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:    [[CALL:%.*]] = call <8 x i64> @callee_inline_asm(ptr [[P0]], i64 [[K]], ptr [[P1]], ptr [[P2]])
-; CHECK-NEXT:    ret <8 x i64> [[CALL]]
+; CHECK-NEXT:    [[SRC_I:%.*]] = load <8 x i64>, ptr [[P0]], align 64
+; CHECK-NEXT:    [[A_I:%.*]] = load <8 x i64>, ptr [[P1]], align 64
+; CHECK-NEXT:    [[B_I:%.*]] = load <8 x i64>, ptr [[P2]], align 64
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A_I]], <8 x i64> [[B_I]], <8 x i64> [[SRC_I]])
+; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
 ;
   %call = call <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2)
   ret <8 x i64> %call
 }
 
 define internal <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #1 {
-; CHECK-LABEL: define {{[^@]+}}@callee_inline_asm
-; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT:    [[SRC:%.*]] = load <8 x i64>, ptr [[P0]], align 64
-; CHECK-NEXT:    [[A:%.*]] = load <8 x i64>, ptr [[P1]], align 64
-; CHECK-NEXT:    [[B:%.*]] = load <8 x i64>, ptr [[P2]], align 64
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A]], <8 x i64> [[B]], <8 x i64> [[SRC]])
-; CHECK-NEXT:    ret <8 x i64> [[TMP1]]
-;
   %src = load <8 x i64>, ptr %p0, align 64
   %a = load <8 x i64>, ptr %p1, align 64
   %b = load <8 x i64>, ptr %p2, align 64

From 33c6b2027698eebfaeda1703ecd2ad0210618183 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Wed, 13 Mar 2024 19:27:21 -0400
Subject: [PATCH 41/46] SystemZ release notes for 18.x. (#84560)

---
 clang/docs/ReleaseNotes.rst  |  5 +++++
 lld/docs/ReleaseNotes.rst    |  5 +++++
 llvm/docs/ReleaseNotes.rst   | 11 +++++++++++
 openmp/docs/ReleaseNotes.rst |  2 ++
 4 files changed, 23 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 101b3a54b9af2..d4401a43c123e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1327,6 +1327,11 @@ AIX Support
   or newer. Similar to the LTO support on AIX, ThinLTO is implemented with
   the libLTO.so plugin.
 
+SystemZ Support
+^^^^^^^^^^^^^^^
+- Properly support 16 byte atomic int/fp types and ops. Atomic __int128 (and
+  long double) variables are now aligned to 16 bytes by default (like gcc 14).
+
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 56ba3463aeadc..6ada711a20a6d 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -163,5 +163,10 @@ WebAssembly Improvements
   is read from object files within the archive.  This matches the behaviour of
   the ELF linker.
 
+SystemZ
+-------
+
+* Add target support for SystemZ (s390x).
+
 Fixes
 #####
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index bfa8e93da05cb..ecfcd2c983ce5 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -215,6 +215,17 @@ Changes to the RISC-V Backend
 * ``-mcpu=sifive-p670`` was added.
 * Support for the Zicond extension is no longer experimental.
 
+Changes to the SystemZ Backend
+------------------------------
+
+* Properly support 16 byte atomic int/fp types and ops.
+* Support i128 as legal type in VRs.
+* Add an i128 cost model.
+* Support building individual functions with backchain using the
+  __attribute__((target("backchain"))) syntax.
+* Add exception handling for XPLINK.
+* Add support for llvm-objcopy.
+
 Changes to the WebAssembly Backend
 ----------------------------------
 
diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst
index 3eeaf5c900d80..a5b39f61b0b64 100644
--- a/openmp/docs/ReleaseNotes.rst
+++ b/openmp/docs/ReleaseNotes.rst
@@ -19,3 +19,5 @@ from the `LLVM releases web site <https://llvm.org/releases/>`_.
 
 Non-comprehensive list of changes in this release
 =================================================
+
+* SystemZ support added.

From 122ba9f100705213774cff2038db953ff8174d91 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 14 Mar 2024 09:51:27 -0700
Subject: [PATCH 42/46] [ELF] Eliminate symbols demoted due to /DISCARD/
 discarded sections (#85167)

#69295 demoted Defined symbols relative to discarded sections.
If such a symbol is unreferenced, the desired behavior is to
eliminate it from .symtab just like --gc-sections discarded
definitions.
Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects
that the unreferenced `unused` is not emitted to .symtab
(https://github.com/ClangBuiltLinux/linux/issues/2006).

For relocations referencing demoted symbols, the symbol index restores
to 0 like older lld (`R_X86_64_64 0` in `discard-section.s`).

Fix #85048

(cherry picked from commit 8fe3e70e810b409dce36f6d415e86f0f9b1cf22d)
---
 lld/ELF/Writer.cpp                          |  3 +++
 lld/test/ELF/linkerscript/discard-section.s | 25 +++++++++++++--------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 6df43a34be013..8a08b0fcc90db 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -261,6 +261,9 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
   Undefined(sym.file, sym.getName(), binding, sym.stOther, sym.type,
             /*discardedSecIdx=*/map.lookup(sym.section))
       .overwrite(sym);
+  // Eliminate from the symbol table, otherwise we would leave an undefined
+  // symbol if the symbol is unreferenced in the absence of GC.
+  sym.isUsedInRegularObj = false;
 }
 
 // If all references to a DSO happen to be weak, the DSO is not added to
diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s
index 24f3b2b73e991..0bbebac59bb34 100644
--- a/lld/test/ELF/linkerscript/discard-section.s
+++ b/lld/test/ELF/linkerscript/discard-section.s
@@ -9,6 +9,9 @@
 # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning:
 # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC
 
+# RUN: ld.lld -r --gc-sections -T a.lds a.o b.o -o a.gc.ro --no-fatal-warnings
+# RUN: llvm-readelf -r -s a.gc.ro | FileCheck %s --check-prefix=RELOC-GC
+
 # LOCAL:      error: relocation refers to a discarded section: .aaa
 # LOCAL-NEXT: >>> defined in a.o
 # LOCAL-NEXT: >>> referenced by a.o:(.bbb+0x0)
@@ -32,16 +35,18 @@
 # WARNING:      warning: relocation refers to a discarded section: .aaa
 # WARNING-NEXT: >>> referenced by a.o:(.rela.bbb+0x0)
 
+## GNU ld reports "defined in discarded section" errors even in -r mode.
+## We set the symbol index to 0.
 # RELOC:      Relocation section '.rela.bbb' at offset {{.*}} contains 1 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
 # RELOC-NEXT: 0000000000000000  0000000000000000 R_X86_64_NONE                             0
 # RELOC-EMPTY:
 # RELOC-NEXT: Relocation section '.rela.data' at offset {{.*}} contains 4 entries:
 # RELOC-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-# RELOC-NEXT: 0000000000000000  0000000500000001 R_X86_64_64            0000000000000000 global + 0
-# RELOC-NEXT: 0000000000000008  0000000700000001 R_X86_64_64            0000000000000000 weak + 0
-# RELOC-NEXT: 0000000000000010  0000000600000001 R_X86_64_64            0000000000000000 weakref1 + 0
-# RELOC-NEXT: 0000000000000018  0000000800000001 R_X86_64_64            0000000000000000 weakref2 + 0
+# RELOC-NEXT: 0000000000000000  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000008  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000010  0000000000000001 R_X86_64_64                             0
+# RELOC-NEXT: 0000000000000018  0000000000000001 R_X86_64_64                             0
 
 # RELOC:      Num:    Value          Size Type    Bind   Vis      Ndx Name
 # RELOC-NEXT:   0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND
@@ -49,23 +54,25 @@
 # RELOC-NEXT:   2: 0000000000000000     0 SECTION LOCAL  DEFAULT    2 .bbb
 # RELOC-NEXT:   3: 0000000000000000     0 SECTION LOCAL  DEFAULT    4 .data
 # RELOC-NEXT:   4: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT    1 _start
-# RELOC-NEXT:   5: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND global
-# RELOC-NEXT:   6: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref1
-# RELOC-NEXT:   7: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weak
-# RELOC-NEXT:   8: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND weakref2
 # RELOC-EMPTY:
 
+# RELOC-GC:   There are no relocations in this file.
+
 #--- a.s
 .globl _start
 _start:
 
 .section .aaa,"a"
-.globl global, weakref1
+.globl global, weakref1, unused
 .weak weak, weakref2
 global:
 weak:
 weakref1:
 weakref2:
+## Eliminate `unused` just like GC discarded definitions.
+## Linux kernel's CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y configuration expects
+## that the unreferenced `unused` is not emitted to .symtab.
+unused:
   .quad 0
 
 .section .bbb,"aw"

From bb83f055091c3e1a024811658f5b76925d3963ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Wed, 13 Mar 2024 22:01:01 +0200
Subject: [PATCH 43/46] [runtimes] Prefer
 -fvisibility-global-new-delete=force-hidden (#84917)

27ce26b06655cfece3d54b30e442ef93d3e78ac7 added the new option
-fvisibility-global-new-delete=, where -fvisibility-global-new-delete=force-hidden
is equivalent to the old option -fvisibility-global-new-delete-hidden.
At the same time, the old option was deprecated.

Test for and use the new option form first; if unsupported, try
using the old form.

This avoids warnings in the MinGW builds, if built with Clang 18 or
newer.

(cherry picked from commit 1f973efd335f34c75fcba1ccbe288fd5ece15a64)
---
 libcxx/src/CMakeLists.txt    | 5 ++++-
 libcxxabi/src/CMakeLists.txt | 5 ++++-
 libunwind/src/CMakeLists.txt | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 44a088663463c..1b80625304a41 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -306,7 +306,10 @@ if (LIBCXX_ENABLE_STATIC)
     # then its code shouldn't declare them with hidden visibility.  They might
     # actually be provided by a shared library at link time.
     if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS)
-      append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden)
+      append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden)
+      if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+        append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden)
+      endif()
     endif()
     target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS})
     # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 4198827203fc8..f4722c3b352d4 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -265,7 +265,10 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY)
   # then its code shouldn't declare them with hidden visibility.  They might
   # actually be provided by a shared library at link time.
   if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS)
-    target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+    target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden)
+    if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+      target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+    endif()
   endif()
   # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in libcxx's
   # __config_site too. Define it in the same way here, to avoid redefinition
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt
index 9c6f5d908b094..780430ba70ba6 100644
--- a/libunwind/src/CMakeLists.txt
+++ b/libunwind/src/CMakeLists.txt
@@ -201,7 +201,10 @@ set_target_properties(unwind_static_objects
 
 if(LIBUNWIND_HIDE_SYMBOLS)
   target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility=hidden)
-  target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+  target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete=force-hidden)
+  if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG)
+    target_add_compile_flags_if_supported(unwind_static_objects PRIVATE -fvisibility-global-new-delete-hidden)
+  endif()
   target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS)
 endif()
 

From 600f7f2ba28f871a7e31a69252e51e01822572cd Mon Sep 17 00:00:00 2001
From: Nathan Ridge <zeratul976@hotmail.com>
Date: Fri, 15 Mar 2024 00:37:43 -0400
Subject: [PATCH 44/46] [clangd] Add clangd 18 release notes (#84436)

---
 clang-tools-extra/docs/ReleaseNotes.rst | 43 +++++++++++++++++++++----
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 5758b5acbc0b5..8621444364fb2 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -51,21 +51,35 @@ Improvements to clangd
 Inlay hints
 ^^^^^^^^^^^
 
-Diagnostics
-^^^^^^^^^^^
-
-Semantic Highlighting
-^^^^^^^^^^^^^^^^^^^^^
+- Type hints
+    * Improved heuristics for showing sugared vs. desugared types
+    * Some hints which provide no information (e.g. ``<dependent-type>``) are now omitted
+- Parameter hints
+    * Parameter hints are now shown for calls through function pointers
+    * Parameter hints are now shown for calls to a class's ``operator()``
+    * No longer show bogus parameter hints for some builtins like ``__builtin_dump_struct``
 
 Compile flags
 ^^^^^^^^^^^^^
 
+- System include extractor (``--query-driver``) improvements
+    * The directory containing builtin headers is now excluded from extracted system includes
+    * Various flags which can affect the system includes (``--target``, ``--stdlib``, ``-specs``) are now forwarded to the driver
+    * Fixed a bug where clangd would sometimes try to call a driver that didn't have obj-c support with ``-x objective-c++-header``
+    * The driver path is now dot-normalized before being compared to the ``--query-driver`` pattern
+    * ``--query-driver`` is now supported by ``clangd-indexer``
+- Fixed a regression in clangd 17 where response files would not be expanded
+
 Hover
 ^^^^^
 
+- Hover now shows alignment info for fields and records
+
 Code completion
 ^^^^^^^^^^^^^^^
 
+- Refined heuristics for determining whether the use of a function can be a call or not
+
 Code actions
 ^^^^^^^^^^^^
 
@@ -75,15 +89,25 @@ Code actions
 Signature help
 ^^^^^^^^^^^^^^
 
+- Improved support for calls through function pointer types
+
 Cross-references
 ^^^^^^^^^^^^^^^^
 
+- Improved support for C++20 concepts
+- Find-references now works for labels
+- Improvements to template heuristics
+
 Objective-C
 ^^^^^^^^^^^
 
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Various stability improvements, e.g. crash fixes
+- Improved error recovery on invalid code
+- Clangd now bails gracefully on assembly and IR source files
+
 Improvements to clang-doc
 -------------------------
 
@@ -564,10 +588,15 @@ Changes in existing checks
 Removed checks
 ^^^^^^^^^^^^^^
 
-Improvements to include-fixer
+Improvements to include-cleaner
 -----------------------------
 
-The improvements are...
+- Support for ``--only-headers`` flag to limit analysis to headers matching a regex
+- Recognizes references through ``concept``\ s
+- Builtin headers are not analyzed
+- Handling of references through ``friend`` declarations
+- Fixes around handling of IWYU pragmas on stdlib headers
+- Improved handling around references to/from template specializations
 
 Improvements to clang-include-fixer
 -----------------------------------

From 9b3edb592debc00a5c3fbf7a71f63e07d6af44be Mon Sep 17 00:00:00 2001
From: David CARLIER <devnexen@gmail.com>
Date: Fri, 15 Mar 2024 04:43:47 +0000
Subject: [PATCH 45/46] release/18.x: [openmp] __kmp_x86_cpuid fix for i386/PIC
 builds. (#84626) (#85053)

---
 openmp/runtime/src/kmp.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index e3a1e20731bbe..d51ec886cfe55 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1403,9 +1403,19 @@ extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
 // subleaf is only needed for cache and topology discovery and can be set to
 // zero in most cases
 static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
+#if KMP_ARCH_X86 && (defined(__pic__) || defined(__PIC__))
+  // On i386 PIC builds ebx is reserved for PIC addressing: stash it in edi
+  // across cpuid, then swap it back and read cpuid's ebx result out of edi.
+  __asm__ __volatile__("mov %%ebx, %%edi\n"
+                       "cpuid\n"
+                       "xchg %%edi, %%ebx\n"
+                       : "=a"(p->eax), "=D"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
+                       : "a"(leaf), "c"(subleaf));
+#else
   __asm__ __volatile__("cpuid"
                        : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
                        : "a"(leaf), "c"(subleaf));
+#endif
 }
 // Load p into FPU control word
 static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {

From 12a3bf35157543ef73f4110215304bef55e79316 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Fri, 15 Mar 2024 08:15:53 -0700
Subject: [PATCH 46/46] workflows: Add workaround for lld failures on MacOS
 (#85021) (#85110)

See #81967

(cherry picked from commit 175b533720956017bb18d1280362f6890ee15b05)
---
 .github/workflows/llvm-project-tests.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml
index 43b90193406fc..a52dd2db8035d 100644
--- a/.github/workflows/llvm-project-tests.yml
+++ b/.github/workflows/llvm-project-tests.yml
@@ -118,6 +118,11 @@ jobs:
           else
             builddir="$(pwd)"/build
           fi
+          if [ "${{ runner.os }}" == "macOS" ]; then
+            # Workaround test failure on some lld tests on MacOS
+            # https://github.com/llvm/llvm-project/issues/81967
+            extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON"
+          fi
           echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT"
           cmake -G Ninja \
                 -B "$builddir" \